{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T09:59:45Z","timestamp":1775815185866,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,3,28]],"date-time":"2022-03-28T00:00:00Z","timestamp":1648425600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"European Research Council under the European Union's Horizon 2020 programme","award":["DAPP, No. 678880; EPiGRAM-HS, No. 801039; MAELSTROM, No. 955513"],"award-info":[{"award-number":["DAPP, No. 678880; EPiGRAM-HS, No. 801039; MAELSTROM, No. 955513"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,4,2]]},"DOI":"10.1145\/3503221.3508399","type":"proceedings-article","created":{"date-parts":[[2022,3,28]],"date-time":"2022-03-28T13:58:22Z","timestamp":1648475902000},"page":"135-149","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":48,"title":["Near-optimal sparse allreduce for distributed deep learning"],"prefix":"10.1145","author":[{"given":"Shigang","family":"Li","sequence":"first","affiliation":[{"name":"ETH Zurich, Switzerland"}]},{"given":"Torsten","family":"Hoefler","sequence":"additional","affiliation":[{"name":"ETH Zurich, Switzerland"}]}],"member":"320","published-online":{"date-parts":[[2022,3,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1990.115971"},{"key":"e_1_3_2_1_2_1","volume-title":"Sparse communication for distributed gradient descent. arXiv preprint arXiv:1704.05021","author":"Aji Alham Fikri","year":"2017","unstructured":"Alham Fikri Aji and Kenneth Heafield. 2017. Sparse communication for distributed gradient descent. arXiv preprint arXiv:1704.05021 (2017)."},{"key":"e_1_3_2_1_3_1","first-page":"1709","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","volume":"30","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in Neural Information Processing Systems 30 (2017), 1709--1720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_4_1","volume-title":"The convergence of sparsified gradient methods. arXiv preprint arXiv:1809.10505","author":"Alistarh Dan","year":"2018","unstructured":"Dan Alistarh, Torsten Hoefler, Mikael Johansson, Sarit Khirirat, Nikola Konstantinov, and C\u00e9dric Renggli. 2018. The convergence of sparsified gradient methods. arXiv preprint arXiv:1809.10505 (2018)."},{"key":"e_1_3_2_1_5_1","volume-title":"Cray XC series network","author":"Alverson Bob","year":"2012","unstructured":"Bob Alverson, Edwin Froese, Larry Kaplan, and Duncan Roweth. 2012. Cray XC series network. Cray Inc., White Paper WP-Aries01-1112 (2012)."},{"key":"e_1_3_2_1_6_1","volume-title":"Building efficient convnets using redundant feature pruning. arXiv preprint arXiv:1802.07653","author":"Ayinde Babajide O","year":"2018","unstructured":"Babajide O Ayinde and Jacek M Zurada. 2018. Building efficient convnets using redundant feature pruning. 
arXiv preprint arXiv:1802.07653 (2018)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3320060"},{"key":"e_1_3_2_1_8_1","volume-title":"International Conference on Machine Learning. PMLR, 560--569","author":"Bernstein Jeremy","year":"2018","unstructured":"Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli, and Animashree Anandkumar. 2018. signSGD: Compressed optimisation for non-convex problems. In International Conference on Machine Learning. PMLR, 560--569."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1137\/16M1080173"},{"key":"e_1_3_2_1_10_1","unstructured":"Tom B Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3195970.3196071"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.1206"},{"key":"e_1_3_2_1_13_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/MLHPC.2016.004"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.37"},{"key":"e_1_3_2_1_18_1","volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS47774.2020.00026"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF01379320"},{"key":"e_1_3_2_1_21_1","volume-title":"Long short-term memory. Neural computation 9, 8","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_22_1","volume-title":"Sparsity in Deep Learning: Pruning and growth for efficient inference and training in neural networks. arXiv preprint arXiv:2102.00554","author":"Hoefler Torsten","year":"2021","unstructured":"Torsten Hoefler, Dan Alistarh, Tal Ben-Nun, Nikoli Dryden, and Alexandra Peste. 2021. Sparsity in Deep Learning: Pruning and growth for efficient inference and training in neural networks. arXiv preprint arXiv:2102.00554 (2021)."},{"key":"e_1_3_2_1_23_1","volume-title":"Stochastic distributed learning with gradient quantization and variance reduction. arXiv preprint arXiv:1904.05115","author":"Horv\u00e1th Samuel","year":"2019","unstructured":"Samuel Horv\u00e1th, Dmitry Kovalev, Konstantin Mishchenko, Sebastian Stich, and Peter Richt\u00e1rik. 2019. Stochastic distributed learning with gradient quantization and variance reduction. 
arXiv preprint arXiv:1904.05115 (2019)."},{"key":"e_1_3_2_1_24_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374528"},{"key":"e_1_3_2_1_26_1","first-page":"1725","article-title":"Breaking (global) barriers in parallel stochastic optimization with wait-avoiding group averaging","volume":"32","author":"Li Shigang","year":"2020","unstructured":"Shigang Li, Tal Ben-Nun, Giorgi Nadiradze, Salvatore Di Girolamo, Nikoli Dryden, Dan Alistarh, and Torsten Hoefler. 2020. Breaking (global) barriers in parallel stochastic optimization with wait-avoiding group averaging. IEEE Transactions on Parallel and Distributed Systems 32, 7 (2020), 1725--1739.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"e_1_3_2_1_27_1","volume-title":"Chimera: Efficiently Training Large-Scale Neural Networks with Bidirectional Pipelines. arXiv preprint arXiv:2107.06925","author":"Li Shigang","year":"2021","unstructured":"Shigang Li and Torsten Hoefler. 2021. Chimera: Efficiently Training Large-Scale Neural Networks with Bidirectional Pipelines. arXiv preprint arXiv:2107.06925 (2021)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2462902.2462903"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1051\/ita\/1995290402551"},{"key":"e_1_3_2_1_30_1","volume-title":"Asynchronous decentralized SGD with quantized and local updates. Advances in Neural Information Processing Systems 34","author":"Nadiradze Giorgi","year":"2021","unstructured":"Giorgi Nadiradze, Amirmojtaba Sabour, Peter Davies, Shigang Li, and Dan Alistarh. 2021. Asynchronous decentralized SGD with quantized and local updates. Advances in Neural Information Processing Systems 34 (2021)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Machine Learning. PMLR, 7937--7947","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. 2021. Memory-efficient pipeline-parallel dnn training. In International Conference on Machine Learning. PMLR, 7937--7947."},{"key":"e_1_3_2_1_33_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. 8026--8037.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. 8026--8037."},{"key":"e_1_3_2_1_34_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. 2019. Language models are unsupervised multitask learners. 
OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33014780"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356222"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00039"},{"key":"e_1_3_2_1_38_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799 (2018)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3183735"},{"key":"e_1_3_2_1_40_1","volume-title":"InfiniBand network architecture","author":"Shanley Tom","unstructured":"Tom Shanley. 2003. InfiniBand network architecture. Addison-Wesley Professional."},{"key":"e_1_3_2_1_41_1","volume-title":"Ka Chun Cheung, and Simon See","author":"Shi Shaohuai","year":"2019","unstructured":"Shaohuai Shi, Xiaowen Chu, Ka Chun Cheung, and Simon See. 2019. Understanding top-k sparsification in distributed deep learning. arXiv preprint arXiv:1911.08772 (2019)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Shaohuai Shi Kaiyong Zhao Qiang Wang Zhenheng Tang and Xiaowen Chu. 2019. A Convergence Analysis of Distributed SGD with Communication-Efficient Gradient Sparsification.. In IJCAI. 3411--3417.","DOI":"10.24963\/ijcai.2019\/473"},{"key":"e_1_3_2_1_44_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_2_1_46_1","volume-title":"Attention is all you need. arXiv preprint arXiv:1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. arXiv preprint arXiv:1706.03762 (2017)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3369583.3392681"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"Wen Wei","year":"2017","unstructured":"Wei Wen, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li. 2017. TernGrad: Ternary Gradients to Reduce Communication in Distributed Deep Learning. In Proceedings of the 31st International Conference on Neural Information Processing Systems (Long Beach, California, USA) (NIPS'17). Curran Associates Inc., Red Hook, NY, USA, 1508--1518."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337874"},{"key":"e_1_3_2_1_50_1","volume-title":"DeepReduce: A Sparse-tensor Communication Framework for Federated Deep Learning. Advances in Neural Information Processing Systems 34","author":"Xu Hang","year":"2021","unstructured":"Hang Xu, Kelly Kostopoulou, Aritra Dutta, Xin Li, Alexandros Ntoulas, and Panos Kalnis. 2021. DeepReduce: A Sparse-tensor Communication Framework for Federated Deep Learning. 
Advances in Neural Information Processing Systems 34 (2021)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356137"},{"key":"e_1_3_2_1_52_1","volume-title":"Large batch optimization for deep learning: Training bert in 76 minutes. arXiv preprint arXiv:1904.00962","author":"You Yang","year":"2019","unstructured":"Yang You, Jing Li, Sashank Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho-Jui Hsieh. 2019. Large batch optimization for deep learning: Training bert in 76 minutes. arXiv preprint arXiv:1904.00962 (2019)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3225058.3225069"}],"event":{"name":"PPoPP '22: 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","location":"Seoul Republic of Korea","acronym":"PPoPP '22","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503221.3508399","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503221.3508399","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:49Z","timestamp":1750186849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503221.3508399"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,28]]},"references-count":53,"alternative-id":["10.1145\/3503221.3508399","10.1145\/3503221"],"URL":"https:\/\/doi.org\/10.1145\/3503221.3508399","relation":{},"subject":[],"published":{"date-parts":[[2022,3,28]]},"assertion":[{"value":"2022-03-28","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}