{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T21:03:13Z","timestamp":1763499793123,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,29]],"date-time":"2022-08-29T00:00:00Z","timestamp":1661731200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["2021YFB0301200"],"award-info":[{"award-number":["2021YFB0301200"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,8,29]]},"DOI":"10.1145\/3545008.3545011","type":"proceedings-article","created":{"date-parts":[[2023,1,15]],"date-time":"2023-01-15T01:04:08Z","timestamp":1673744648000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["EmbRace: Accelerating Sparse Communication for Distributed Training of Deep Neural Networks"],"prefix":"10.1145","author":[{"given":"Shengwei","family":"Li","sequence":"first","affiliation":[{"name":"National University of Defense Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiquan","family":"Lai","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongsheng","family":"Li","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiming","family":"Zhang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China and Xiamen University, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiangyu","family":"Ye","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yabo","family":"Duan","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2019. Gloo: a collective communications library.https:\/\/github.com\/facebookincubator\/gloo"},{"key":"e_1_3_2_1_2_1","unstructured":"2019. NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl"},{"key":"e_1_3_2_1_3_1","first-page":"1709","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","volume":"30","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in Neural Information Processing Systems 30 (2017), 1709\u20131720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155446"},{"key":"e_1_3_2_1_5_1","volume-title":"Findings of the 2014 workshop on statistical machine translation. In Proceedings of the ninth workshop on statistical machine translation. 12\u201358","author":"Bojar Ond\u0159ej","year":"2014","unstructured":"Ond\u0159ej Bojar, Christian Buck, Christian Federmann, Barry Haddow, Philipp Koehn, Johannes Leveling, Christof Monz, Pavel Pecina, Matt Post, Herve Saint-Amand, 2014. Findings of the 2014 workshop on statistical machine translation. In Proceedings of the ninth workshop on statistical machine translation. 12\u201358."},{"key":"e_1_3_2_1_6_1","volume-title":"Findings of the 2016 conference on machine translation. In Proceedings of the First Conference on Machine Translation","volume":"2","author":"Bojar Ond\u0159ej","year":"2016","unstructured":"Ond\u0159ej Bojar, Rajen Chatterjee, Christian Federmann, Yvette Graham, Barry Haddow, Matthias Huck, Antonio\u00a0Jimeno Yepes, Philipp Koehn, Varvara Logacheva, Christof Monz, 2016. Findings of the 2016 conference on machine translation. In Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers. 131\u2013198."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Ciprian Chelba Tomas Mikolov Mike Schuster Qi Ge Thorsten Brants Phillipp Koehn and Tony Robinson. 2013. One billion word benchmark for measuring progress in statistical language modeling. arXiv preprint arXiv:1312.3005(2013).","DOI":"10.21437\/Interspeech.2014-564"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 2nd SysML Conference.","author":"Cho Minsik","year":"2019","unstructured":"Minsik Cho, Ulrich Finkler, and David Kung. 2019. BlueConnect: Novel hierarchical all-reduce on multi-tired network for deep learning. In Proceedings of the 2nd SysML Conference."},{"key":"e_1_3_2_1_9_1","unstructured":"Jeffrey Dean Greg\u00a0S Corrado Rajat Monga Kai Chen Matthieu Devin Quoc\u00a0V Le Mark\u00a0Z Mao Marc\u2019Aurelio Ranzato Andrew Senior Paul Tucker 2012. Large scale distributed deep networks. (2012)."},{"key":"e_1_3_2_1_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018).","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018)."},{"key":"e_1_3_2_1_11_1","volume-title":"Adaptive subgradient methods for online learning and stochastic optimization.Journal of machine learning research 12, 7","author":"Duchi John","year":"2011","unstructured":"John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods for online learning and stochastic optimization.Journal of machine learning research 12, 7 (2011)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-30218-6_19"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747408"},{"key":"e_1_3_2_1_15_1","volume-title":"Tictac: Accelerating distributed deep learning with communication scheduling. arXiv preprint arXiv:1803.03288(2018).","author":"Hashemi Sayed\u00a0Hadi","year":"2018","unstructured":"Sayed\u00a0Hadi Hashemi, Sangeetha\u00a0Abdu Jyothi, and Roy\u00a0H Campbell. 2018. Tictac: Accelerating distributed deep learning with communication scheduling. arXiv preprint arXiv:1803.03288(2018)."},{"key":"e_1_3_2_1_16_1","unstructured":"Anand Jayarajan Jinliang Wei Garth Gibson Alexandra Fedorova and Gennady Pekhimenko. 2019. Priority-based parameter propagation for distributed DNN training. arXiv preprint arXiv:1905.03960(2019)."},{"key":"e_1_3_2_1_17_1","unstructured":"Xianyan Jia Shutao Song Wei He Yangzihao Wang Haidong Rong Feihu Zhou Liqiang Xie Zhenyu Guo Yuanzhou Yang Liwei Yu 2018. Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. arXiv preprint arXiv:1807.11205(2018)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_19_1","unstructured":"Rafal Jozefowicz Oriol Vinyals Mike Schuster Noam Shazeer and Yonghui Wu. 2016. Exploring the limits of language modeling. arXiv preprint arXiv:1602.02410(2016)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/MLHPC.2016.006"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303957"},{"key":"e_1_3_2_1_22_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014).","author":"Kingma P","year":"2014","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014)."},{"key":"e_1_3_2_1_23_1","volume-title":"Sentencepiece: A simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226(2018).","author":"Kudo Taku","year":"2018","unstructured":"Taku Kudo and John Richardson. 2018. Sentencepiece: A simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226(2018)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00173"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_1_26_1","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania 2020. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704(2020)."},{"key":"e_1_3_2_1_27_1","unstructured":"Yujun Lin Song Han Huizi Mao Yu Wang and William\u00a0J Dally. 2017. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887(2017)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1023\/B:IJPP.0000029272.69895.c1"},{"key":"e_1_3_2_1_29_1","unstructured":"Hiroaki Mikami Hisahiro Suganuma Yoshiki Tanaka Yuichi Kageyama 2018. Massively distributed SGD: ImageNet\/ResNet-50 training in a flash. arXiv preprint arXiv:1811.05233(2018)."},{"key":"e_1_3_2_1_30_1","unstructured":"Dheevatsa Mudigere Yuchen Hao Jianyu Huang Andrew Tulloch Srinivas Sridharan Xing Liu Mustafa Ozdal Jade Nie Jongsoo Park Liang Luo 2021. High-performance distributed training of large-scale deep learning recommendation models. arXiv preprint arXiv:2104.05158(2021)."},{"key":"e_1_3_2_1_31_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019), 8026\u20138037."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Pranav Rajpurkar Jian Zhang Konstantin Lopyrev and Percy Liang. 2016. Squad: 100 000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250(2016).","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_35_1","unstructured":"Sebastian Ruder. 2016. An overview of gradient descent optimization algorithms. arXiv preprint arXiv:1609.04747(2016)."},{"key":"e_1_3_2_1_36_1","unstructured":"Alexander Sergeev and Mike Del\u00a0Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799(2018)."},{"volume-title":"4th Intl Conf on Big Data Intelligence and Computing and Cyber Science and Technology Congress (DASC\/PiCom\/DataCom\/CyberSciTech)","author":"Shi Shaohuai","key":"e_1_3_2_1_37_1","unstructured":"Shaohuai Shi, Qiang Wang, and Xiaowen Chu. 2018. Performance modeling and evaluation of distributed deep learning frameworks on gpus. In 2018 IEEE 16th Intl Conf on Dependable, Autonomic and Secure Computing, 16th Intl Conf on Pervasive Intelligence and Computing, 4th Intl Conf on Big Data Intelligence and Computing and Cyber Science and Technology Congress (DASC\/PiCom\/DataCom\/CyberSciTech). IEEE, 949\u2013957."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Peng Sun Wansen Feng Ruobing Han Shengen Yan and Yonggang Wen. 2019. Optimizing network performance for distributed dnn training on gpu clusters: Imagenet\/alexnet training in 1.5 minutes. arXiv preprint arXiv:1902.06855(2019).","DOI":"10.1109\/TBDATA.2019.2957478"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-87475-1_16"},{"key":"e_1_3_2_1_40_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998\u20136008."},{"key":"e_1_3_2_1_41_1","volume-title":"Blink: Fast and generic collectives for distributed ml. arXiv preprint arXiv:1910.04940(2019).","author":"Wang Guanhua","year":"2019","unstructured":"Guanhua Wang, Shivaram Venkataraman, Amar Phanishayee, Jorgen Thelin, Nikhil Devanur, and Ion Stoica. 2019. Blink: Fast and generic collectives for distributed ml. arXiv preprint arXiv:1910.04940(2019)."},{"key":"e_1_3_2_1_42_1","unstructured":"Yonghui Wu Mike Schuster Zhifeng Chen Quoc\u00a0V Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey 2016. Google\u2019s neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144(2016)."},{"key":"e_1_3_2_1_43_1","volume-title":"Model Average-based Distributed Training for Sparse Deep Neural Networks. In IEEE INFOCOM 2020-IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS). IEEE, 1346\u20131347","author":"Yang Yuetong","year":"2020","unstructured":"Yuetong Yang, Zhiquan Lai, Lei Cai, and Dongsheng Li. 2020. Model Average-based Distributed Training for Sparse Deep Neural Networks. In IEEE INFOCOM 2020-IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS). IEEE, 1346\u20131347."},{"key":"e_1_3_2_1_44_1","volume-title":"2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Zhang Hao","year":"2017","unstructured":"Hao Zhang, Zeyu Zheng, Shizhen Xu, Wei Dai, Qirong Ho, Xiaodan Liang, Zhiting Hu, Jinliang Wei, Pengtao Xie, and Eric\u00a0P Xing. 2017. Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters. In 2017 USENIX Annual Technical Conference (USENIX ATC 17). 181\u2013193."}],"event":{"name":"ICPP '22: 51st International Conference on Parallel Processing","acronym":"ICPP '22","location":"Bordeaux France"},"container-title":["Proceedings of the 51st International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3545008.3545011","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3545008.3545011","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:43Z","timestamp":1750186963000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3545008.3545011"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,29]]},"references-count":43,"alternative-id":["10.1145\/3545008.3545011","10.1145\/3545008"],"URL":"https:\/\/doi.org\/10.1145\/3545008.3545011","relation":{},"subject":[],"published":{"date-parts":[[2022,8,29]]},"assertion":[{"value":"2023-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}