{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:46:28Z","timestamp":1782834388427,"version":"3.54.5"},"reference-count":46,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2021,8,1]],"date-time":"2021-08-01T00:00:00Z","timestamp":1627776000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,8,1]],"date-time":"2021-08-01T00:00:00Z","timestamp":1627776000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,8,1]],"date-time":"2021-08-01T00:00:00Z","timestamp":1627776000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Hong Kong RGC GRF","award":["HKBU 12200418"],"award-info":[{"award-number":["HKBU 12200418"]}]},{"name":"Hong Kong RGC GRF","award":["HKUST 16206417"],"award-info":[{"award-number":["HKUST 16206417"]}]},{"name":"Hong Kong RGC GRF","award":["16207818"],"award-info":[{"award-number":["16207818"]}]},{"name":"RGC CRF","award":["C7036-15G"],"award-info":[{"award-number":["C7036-15G"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2021,8,1]]},"DOI":"10.1109\/tpds.2021.3052862","type":"journal-article","created":{"date-parts":[[2021,1,20]],"date-time":"2021-01-20T20:30:15Z","timestamp":1611174615000},"page":"1903-1917","source":"Crossref","is-referenced-by-count":32,"title":["MG-WFBP: Merging Gradients Wisely for Efficient Communication in Distributed Deep Learning"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1418-5160","authenticated-orcid":false,"given":"Shaohuai","family":"Shi","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9745-4372","authenticated-orcid":false,"given":"Xiaowen","family":"Chu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2955-750X","authenticated-orcid":false,"given":"Bo","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref38","first-page":"4278","article-title":"Inception-v4, inception-resNet and the impact of residual connections on learning","author":"szegedy","year":"2017","journal-title":"Proc 31st AAAI Conf Artif Intell"},{"key":"ref33","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"Proc 12th USENIX Conf Operating Syst Des Implementation"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CCBD.2016.029"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2009.09.001"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/505202.505215"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref35","article-title":"Understanding the Python GIL","author":"beazley","year":"2010","journal-title":"Proc PyCON Python Conf"},{"key":"ref34","article-title":"Nvidia CUDA C programming guide","volume":"120","author":"nvidia","year":"2011","journal-title":"NVIDIA Corporation"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"ref40","article-title":"Mixed precision training","author":"micikevicius","year":"2018","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/473"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2013.17"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737587"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126954"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018769"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref17","first-page":"181","article-title":"Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters","author":"zhang","year":"2017","journal-title":"Proc USENIX Conf USENIX Annu Tech Conf"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/DASC\/PiCom\/DataCom\/CyberSciTec.2018.000-4"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3098822.3098825"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901323"},{"key":"ref27","first-page":"1","article-title":"Optimization of collective reduction operations","author":"rabenseifner","year":"2004","journal-title":"Proc Int Conf Comput Sci"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/2987550.2987586"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737595"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15646-5_3"},{"key":"ref5","first-page":"425","article-title":"A DAG model of synchronous stochastic gradient descent in distributed deep learning","author":"shi","year":"2018","journal-title":"Proc IEEE 24rd Int Conf Parallel Distrib Syst"},{"key":"ref8","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"lin","year":"2018","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref7","first-page":"1707","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","author":"alistarh","year":"2017","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref2","article-title":"Accurate, large minibatch SGD: Training ImageNet in 1 hour","author":"goyal","year":"2017"},{"key":"ref9","first-page":"1508","article-title":"TernGrad: Ternary gradients to reduce communication in distributed deep learning","author":"wen","year":"2017","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref1","first-page":"1223","article-title":"Large scale distributed deep networks","author":"dean","year":"2012","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155446"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155269"},{"key":"ref22","article-title":"Communication-efficient distributed deep learning: A comprehensive survey","author":"tang","year":"2020"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126912"},{"key":"ref42","first-page":"19","article-title":"Communication efficient distributed machine learning with the parameter server","author":"li","year":"2014","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"ref41","article-title":"Highly scalable deep learning training system with mixed-precision: Training ImageNet in four minutes","author":"jia","year":"2018","journal-title":"Proc Workshop Syst ML Open Source Softw Collocated"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/MNET.011.2000530"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3331526"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737367"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS.2017.00097"},{"key":"ref25","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Advances Neural Inf Process Syst"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/9360409\/09328614.pdf?arnumber=9328614","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T14:50:25Z","timestamp":1652194225000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9328614\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,1]]},"references-count":46,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2021.3052862","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,8,1]]}}}