{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T09:41:40Z","timestamp":1775122900545,"version":"3.50.1"},"reference-count":44,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T00:00:00Z","timestamp":1620604800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T00:00:00Z","timestamp":1620604800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T00:00:00Z","timestamp":1620604800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,5,10]]},"DOI":"10.1109\/infocom42981.2021.9488803","type":"proceedings-article","created":{"date-parts":[[2021,7,26]],"date-time":"2021-07-26T20:07:32Z","timestamp":1627330052000},"page":"1-10","source":"Crossref","is-referenced-by-count":34,"title":["Exploiting Simultaneous Communications to Accelerate Data Parallel Distributed Deep Learning"],"prefix":"10.1109","author":[{"given":"Shaohuai","family":"Shi","sequence":"first","affiliation":[]},{"given":"Xiaowen","family":"Chu","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Li","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","article-title":"Blueconnect: Novel hierarchical all-reduce on multi-tired network for deep learning","author":"cho","year":"2019","journal-title":"Proc of MLSys 2019"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737595"},{"key":"ref33","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proc of ACL"},{"key":"ref32","doi-asserted-by":"crossref","DOI":"10.1609\/aaai.v31i1.11231","article-title":"Inception-v4, inception-resnet and the impact of residual connections on learning","author":"szegedy","year":"2017","journal-title":"Proc of the 31st AAAI"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref30","article-title":"Towards scalable distributed training of deep learning on public cloud clusters","author":"shi","year":"2020"},{"key":"ref37","article-title":"Massively distributed sgd: Imagenet\/resnet-50 training in a flash","author":"mikami","year":"2018"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/2966884.2966912"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2009.09.001"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-24685-5_1"},{"key":"ref10","first-page":"3054","article-title":"A comprehensive linear speedup analysis for asynchronous stochastic parallel optimization from zeroth-order to first-order","author":"lian","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374528"},{"key":"ref11","first-page":"1709","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","author":"alistarh","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref12","first-page":"1509","article-title":"Terngrad: Ternary gradients to reduce communication in distributed deep learning","author":"wen","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref13","first-page":"1299","article-title":"Gradient sparsification for communication-efficient distributed optimization","author":"wangni","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356222"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155269"},{"key":"ref17","article-title":"ScaleCom: Scalable sparsified gradient compression for communication-efficient distributed training","volume":"33","author":"chen","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref18","first-page":"4120","article-title":"Asynchronous stochastic gradient descent with delay compensation","author":"zheng","year":"2017","journal-title":"International Conference on Machine Learning"},{"key":"ref19","first-page":"3252","article-title":"Error feedback fixes SignSGD and other gradient compression schemes","author":"karimireddy","year":"2019","journal-title":"International Conference on Machine Learning"},{"key":"ref28","article-title":"Horovod: fast and easy distributed deep learning in TensorFlow","author":"sergeev","year":"2018"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155446"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737367"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref29","first-page":"1","article-title":"Enabling work-conserving bandwidth guarantees for multi-tenant data-centers via dynamic tenant-queue binding","author":"liu","year":"2018","journal-title":"Proc of IEEE INFOCOM"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/DASC\/PiCom\/DataCom\/CyberSciTec.2018.000-4"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737587"},{"key":"ref7","first-page":"1223","article-title":"More effective distributed ML via a stale synchronous parallel parameter server","author":"ho","year":"2013","journal-title":"Advances in neural information processing systems"},{"key":"ref2","article-title":"Highly scalable deep learning training system with mixed-precision: Training ImageNet in four minutes","author":"jia","year":"2018","journal-title":"Proc of Workshop on Systems for ML and Open Source Software collocated with NeurIPS 2018"},{"key":"ref9","first-page":"2737","article-title":"Asynchronous parallel stochastic gradient for nonconvex optimization","author":"lian","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref1","first-page":"1223","article-title":"Large scale distributed deep networks","author":"dean","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/473"},{"key":"ref22","first-page":"181","article-title":"Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters","author":"zhang","year":"2017","journal-title":"Proc of USENIX ATC 17"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018769"},{"key":"ref42","first-page":"172","article-title":"Blink: Fast and generic collectives for distributed ML","author":"wang","year":"2020","journal-title":"Proc of MLSys 2020"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3052862"},{"key":"ref41","article-title":"Image classification at supercomputer scale","author":"ying","year":"2018","journal-title":"Proc of Workshop on Systems for ML and Open Source Software collocated with NeurIPS 2018"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126912"},{"key":"ref44","first-page":"82","article-title":"PLink: Discovering and exploiting locality for accelerated distributed training on the public cloud","author":"luo","year":"2020","journal-title":"Proc of MLSys 2020"},{"key":"ref26","article-title":"Priority-based parameter propagation for distributed DNN training","author":"jayarajan","year":"2019","journal-title":"Proc of MLSys 2019"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392771"},{"key":"ref25","article-title":"TicTac: Accelerating distributed deep learning with communication scheduling","author":"hashemi","year":"2019","journal-title":"Proc of MLSys 2019"}],"event":{"name":"IEEE INFOCOM 2021 - IEEE Conference on Computer Communications","location":"Vancouver, BC, Canada","start":{"date-parts":[[2021,5,10]]},"end":{"date-parts":[[2021,5,13]]}},"container-title":["IEEE INFOCOM 2021 - IEEE Conference on Computer Communications"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9488422\/9488423\/09488803.pdf?arnumber=9488803","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,5]],"date-time":"2023-01-05T10:21:18Z","timestamp":1672914078000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9488803\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,5,10]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/infocom42981.2021.9488803","relation":{},"subject":[],"published":{"date-parts":[[2021,5,10]]}}}