{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:24:27Z","timestamp":1767986667510,"version":"3.49.0"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,7]]},"DOI":"10.1109\/infocom41043.2020.9155269","type":"proceedings-article","created":{"date-parts":[[2020,8,4]],"date-time":"2020-08-04T22:29:35Z","timestamp":1596580175000},"page":"406-415","source":"Crossref","is-referenced-by-count":54,"title":["Communication-Efficient Distributed Deep Learning with Merged Gradient Sparsification on GPUs"],"prefix":"10.1109","author":[{"given":"Shaohuai","family":"Shi","sequence":"first","affiliation":[]},{"given":"Qiang","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xiaowen","family":"Chu","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Qin","sequence":"additional","affiliation":[]},{"given":"Ruihao","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xinxiao","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","first-page":"559","article-title":"SIGNSGD: Compressed optimisation for non-convex problems","author":"bernstein","year":"2018","journal-title":"Proc of International Conference on Machine Learning"},{"key":"ref38","article-title":"1-bit stochastic gradient descent and application to data-parallel distributed training of speech DNNs","author":"seide","year":"2014","journal-title":"Proc of Interspeech"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21236\/ADA273556"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref31","doi-asserted-by":"crossref","DOI":"10.1609\/aaai.v31i1.11231","article-title":"Inception-v4, inception-resnet and the impact of residual connections on learning","author":"szegedy","year":"2017","journal-title":"Proc 31st AAAI"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737595"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/2966884.2966912"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2010.04.056"},{"key":"ref34","article-title":"Bringing HPC techniques to deep learning.(2017)","author":"gibiansky","year":"2017"},{"key":"ref10","article-title":"Understanding top-k sparsification in distributed deep learning","author":"shi","year":"2019"},{"key":"ref40","article-title":"Scalable distributed DNN training using commodity GPU cloud computing","author":"strom","year":"2015","journal-title":"Proc of Interspeech"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CCBD.2016.029"},{"key":"ref12","article-title":"Stochastic nonconvex optimization with large minibatches","author":"wang","year":"2019","journal-title":"Proceedings of International Conference on Algorithmic Learning Theory"},{"key":"ref13","first-page":"181","article-title":"Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters","author":"zhang","year":"2017","journal-title":"Proc of USENIX ATC"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018769"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126912"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737367"},{"key":"ref17","article-title":"Horovod: fast and easy distributed deep learning in TensorFlow","author":"sergeev","year":"2018"},{"key":"ref18","article-title":"Highly scalable deep learning training system with mixed-precision: Training ImageNet in four minutes","author":"jia","year":"2018","journal-title":"Proc of Workshop on Systems for ML and Open Source Software collocated with NeurIPS 2018"},{"key":"ref19","article-title":"Layer-wise adaptive gradient sparsification for distributed deep learning with convergence guarantees","author":"shi","year":"2020","journal-title":"Proc of The 24th European Conference on Artificial Intelligence"},{"key":"ref28","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014"},{"key":"ref4","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"lin","year":"2018","journal-title":"International Conference on Learning Representations"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1045"},{"key":"ref6","first-page":"1707","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","author":"alistarh","year":"2017","journal-title":"Proc of Advances in Neural Information Processing systems"},{"key":"ref29","article-title":"Cifar-10 (canadian institute for advanced research)","author":"krizhevsky","year":"2010"},{"key":"ref5","first-page":"1509","article-title":"Terngrad: Ternary gradients to reduce communication in distributed deep learning","author":"wen","year":"2017","journal-title":"Proc of Advances in Neural Information Processing systems"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/473"},{"key":"ref7","first-page":"2530","article-title":"A linear speedup analysis of distributed deep learning with sparse and quantized communication","author":"jiang","year":"2018","journal-title":"Proc of Advances in Neural Information Processing systems"},{"key":"ref2","article-title":"Accurate, large minibatch SGD: Training ImageNet in 1 hour","author":"goyal","year":"2017"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3183735"},{"key":"ref1","doi-asserted-by":"crossref","first-page":"436","DOI":"10.1038\/nature14539","article-title":"Deep learning","volume":"521","author":"lecun","year":"2015","journal-title":"Nature"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3225058.3225069"},{"key":"ref20","first-page":"265","article-title":"Tensorflow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"Proc of USENIX OSDI"},{"key":"ref45","article-title":"Local SGD converges fast and communicates little","author":"stich","year":"2019","journal-title":"International Conference on Learning Representations"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/DASC\/PiCom\/DataCom\/CyberSciTec.2018.000-4"},{"key":"ref47","first-page":"8045","article-title":"Pipe-SGD: A decentralized pipelined SGD framework for distributed deep net training","author":"li","year":"2018","journal-title":"Proc of Advances in Neural Information Processing systems"},{"key":"ref21","article-title":"Automatic differentiation in PyTorch","author":"paszke","year":"2017","journal-title":"Proc NIPS Autodiff Workshop"},{"key":"ref42","first-page":"1299","article-title":"Gradient sparsification for communication-efficient distributed optimization","author":"wangni","year":"2018","journal-title":"Proc of Advances in Neural Information Processing systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356222"},{"key":"ref41","article-title":"Adacomp: Adaptive residual gradient compression for data-parallel distributed training","author":"chen","year":"2018","journal-title":"Proc 32nd AAAI"},{"key":"ref23","article-title":"Benchmarking the performance and power of AI accelerators for AI training","author":"wang","year":"2019"},{"key":"ref44","article-title":"Don&#x2019;t use large mini-batches, use local SGD","author":"lin","year":"2018"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/505202.505215"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"}],"event":{"name":"IEEE INFOCOM 2020 - IEEE Conference on Computer Communications","location":"Toronto, ON, Canada","start":{"date-parts":[[2020,7,6]]},"end":{"date-parts":[[2020,7,9]]}},"container-title":["IEEE INFOCOM 2020 - IEEE Conference on Computer Communications"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9146503\/9155217\/09155269.pdf?arnumber=9155269","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,5]],"date-time":"2022-11-05T19:16:56Z","timestamp":1667675816000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9155269\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,7]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/infocom41043.2020.9155269","relation":{},"subject":[],"published":{"date-parts":[[2020,7]]}}}