{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T08:40:50Z","timestamp":1769503250326,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62090022,62090023"],"award-info":[{"award-number":["62090022,62090023"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Strategic Priority Research Program of Chinese Academy of Sciences","award":["XDA0320000,XDA0320300"],"award-info":[{"award-number":["XDA0320000,XDA0320300"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756092","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"643-658","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SkipReduce: (Interconnection) Network Sparsity to Accelerate Distributed Machine Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8828-8058","authenticated-orcid":false,"given":"Hans","family":"Kasan","sequence":"first","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7108-9013","authenticated-orcid":false,"given":"Dennis","family":"Abts","sequence":"additional","affiliation":[{"name":"NVIDIA, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5691-4771","authenticated-orcid":false,"given":"Jungwook","family":"Choi","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3958-3891","authenticated-orcid":false,"given":"John","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2023. Nvidia Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl. Accessed: 2023-07-23."},{"key":"e_1_3_3_2_3_2","unstructured":"2023. Nvidia CUDA Random Number Generation (cuRAND) library. https:\/\/developer.nvidia.com\/curand. Accessed: 2023-07-23."},{"key":"e_1_3_3_2_4_2","unstructured":"Alham\u00a0Fikri Aji and Kenneth Heafield. 2017. Sparse Communication for Distributed Gradient Descent. arXiv abs\/1704.05021 (2017). arXiv:https:\/\/arXiv.org\/abs\/1704.05021http:\/\/arxiv.org\/abs\/1704.05021"},{"key":"e_1_3_3_2_5_2","volume-title":"Advances in Neural Information Processing Systems","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-Efficient SGD via Gradient Quantization and Encoding. In Advances in Neural Information Processing Systems , Vol.\u00a030. Curran Associates, Inc."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.5555\/3327345.3327497"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","unstructured":"Tal Ben-Nun and Torsten Hoefler. 2019. Demystifying Parallel and Distributed Deep Learning: An In-Depth Concurrency Analysis. ACM Comput. Surv. 52 4 Article 65 (Aug. 2019) 43\u00a0pages. 10.1145\/3320060","DOI":"10.1145\/3320060"},{"key":"e_1_3_3_2_8_2","volume-title":"Accelerating AI at-scale with Selene DGXA100 SuperPOD and Lustre Parallel Filesystem Storage","author":"Bernauer J.","year":"2021","unstructured":"J. Bernauer and P. Kashinkunti. 2021. Accelerating AI at-scale with Selene DGXA100 SuperPOD and Lustre Parallel Filesystem Storage. Technical Report. NVIDIA, USA."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071117"},{"key":"e_1_3_3_2_10_2","volume-title":"4th International Conference on Learning Representations, ICLR 2016, San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings","author":"Dettmers Tim","year":"2016","unstructured":"Tim Dettmers. 2016. 8-Bit Approximations for Parallelism in Deep Learning. In 4th International Conference on Learning Representations, ICLR 2016, San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1511.04561"},{"key":"e_1_3_3_2_11_2","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv abs\/1810.04805 (2018). arXiv:https:\/\/arXiv.org\/abs\/1810.04805http:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/MLHPC.2016.004"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"e_1_3_3_2_14_2","series-title":"(NIPS\u201918)","first-page":"10750","volume-title":"Proceedings of the 32nd International Conference on Neural Information Processing Systems","author":"Ghiasi Golnaz","year":"2018","unstructured":"Golnaz Ghiasi, Tsung-Yi Lin, and Quoc\u00a0V. Le. 2018. DropBlock: a regularization method for convolutional networks. In Proceedings of the 32nd International Conference on Neural Information Processing Systems (Montr\u00e9al, Canada) (NIPS\u201918). Curran Associates Inc., Red Hook, NY, USA, 10750\u201310760."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.5555\/3086952"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/COMHPC.2016.006"},{"key":"e_1_3_3_2_17_2","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan Amy Yang Angela Fan Anirudh Goyal Anthony Hartshorn Aobo Yang Archi Mitra Archie Sravankumar Artem Korenev Arthur Hinsvark and et\u00a0al. Arun\u00a0Rao. 2024. The Llama 3 Herd of Models. arxiv:https:\/\/arXiv.org\/abs\/2407.21783\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_3_2_18_2","series-title":"(ICML\u201915)","first-page":"1737","volume-title":"Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37","author":"Gupta Suyog","year":"2015","unstructured":"Suyog Gupta, Ankur Agrawal, Kailash Gopalakrishnan, and Pritish Narayanan. 2015. Deep learning with limited numerical precision. In Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37 (Lille, France) (ICML\u201915). 1737\u20131746."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00023"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00085"},{"key":"e_1_3_3_2_22_2","unstructured":"Alex Krizhevsky. 2009. Learning Multiple Layers of Features from Tiny Images. https:\/\/api.semanticscholar.org\/CorpusID:18268744"},{"key":"e_1_3_3_2_23_2","series-title":"(ICLR \u201917)","volume-title":"International Conference on Learning Representations","author":"Larsson Gustav","year":"2017","unstructured":"Gustav Larsson, Michael Maire, and Gregory Shakhnarovich. 2017. FractalNet: Ultra-Deep Neural Networks without Residuals. In International Conference on Learning Representations (Toulon, France) (ICLR \u201917)."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00069"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Yann LeCun L\u00e9on Bottou Yoshua Bengio and Patrick Haffner. 1998. Gradient-based learning applied to document recognition. Proc. IEEE 86 11 (1998) 2278\u20132324.","DOI":"10.1109\/5.726791"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508399"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322259"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00023"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00114"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672228"},{"key":"e_1_3_3_2_31_2","volume-title":"International Conference on Learning Representations","author":"Lin Yujun","year":"2018","unstructured":"Yujun Lin, Song Han, Huizi Mao, Yu Wang, and Bill Dally. 2018. Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training. In International Conference on Learning Representations."},{"key":"e_1_3_3_2_32_2","unstructured":"Paulius Micikevicius. 2020. Fundamentals of scaling out DL training. HotChips (2020)."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_2_34_2","volume-title":"Advances in Neural Information Processing Systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems , Vol.\u00a032. Curran Associates, Inc."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","unstructured":"Pitch Patarasuk and Xin Yuan. 2009. Bandwidth optimal all-reduce algorithms for clusters of workstations. J. Parallel and Distrib. Comput. 69 2 (2009) 117\u2013124. 10.1016\/j.jpdc.2008.09.002","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_3_2_36_2","unstructured":"Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. arxiv:https:\/\/arXiv.org\/abs\/1910.02054\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1910.02054"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527382"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356222"},{"key":"e_1_3_3_2_40_2","series-title":"(NIPS\u201915)","first-page":"2674","volume-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 2","author":"Sa Christopher\u00a0De","year":"2015","unstructured":"Christopher\u00a0De Sa, Ce Zhang, Kunle Olukotun, and Christopher R\u00e9. 2015. Taming the wild: a unified analysis of HOG WILD! -style algorithms. In Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 2 (Montreal, Canada) (NIPS\u201915). MIT Press, Cambridge, MA, USA, 2674\u20132682."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-274"},{"key":"e_1_3_3_2_42_2","unstructured":"Shaohuai Shi Xiaowen Chu Ka\u00a0Chun Cheung and Simon See. 2019. Understanding Top-k Sparsification in Distributed Deep Learning. arXiv abs\/1911.08772 (2019). arXiv:https:\/\/arXiv.org\/abs\/1911.08772https:\/\/arxiv.org\/abs\/1911.08772"},{"key":"e_1_3_3_2_43_2","unstructured":"Shaohuai Shi Qiang Wang Kaiyong Zhao Zhenheng Tang Yuxin Wang Xiang Huang and Xiaowen Chu. 2019. A Distributed Synchronous SGD Algorithm with Global Top-k Sparsification for Low Bandwidth Networks. arXiv abs\/1901.04359 (2019). arXiv:https:\/\/arXiv.org\/abs\/1901.04359https:\/\/arxiv.org\/abs\/1901.04359"},{"key":"e_1_3_3_2_44_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv abs\/1909.08053 (2019). arXiv:https:\/\/arXiv.org\/abs\/1909.08053https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_3_2_45_2","series-title":"(ICLR \u201915)","volume-title":"3rd International Conference on Learning Representations","author":"Simonyan Karen","year":"2015","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In 3rd International Conference on Learning Representations(ICLR \u201915). San Diego, CA, USA."},{"key":"e_1_3_3_2_46_2","unstructured":"Nitish Srivastava Geoffrey Hinton Alex Krizhevsky Ilya Sutskever and Ruslan Salakhutdinov. 2014. Dropout: A Simple Way to Prevent Neural Networks from Overfitting. Journal of Machine Learning Research 15 56 (2014) 1929\u20131958."},{"key":"e_1_3_3_2_47_2","unstructured":"Sebastian\u00a0U. Stich Jean-Baptiste Cordonnier and Martin Jaggi. 2018. Sparsified SGD with Memory(NIPS\u201918). Curran Associates Inc. Red Hook NY USA 4452\u20134463."},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-354"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895534"},{"key":"e_1_3_3_2_50_2","series-title":"(ACCV \u201920)","volume-title":"Asian Conference on Computer Vision","author":"Tseng Hung-Yu","year":"2020","unstructured":"Hung-Yu Tseng, Yi-Wen Chen, Yi-Hsuan Tsai, Sifei Liu, Yen-Yu Lin, and Ming-Hsuan Yang. 2020. Regularizing Meta-Learning via Gradient Dropout. In Asian Conference on Computer Vision (Kyoto, Japan) (ACCV \u201920)."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","unstructured":"Leslie\u00a0G. Valiant. 1990. A Bridging Model for Parallel Computation. Commun. ACM 33 8 (Aug. 1990) 103\u2013111. 10.1145\/79173.79181","DOI":"10.1145\/79173.79181"},{"key":"e_1_3_3_2_52_2","volume-title":"PowerSGD: practical low-rank gradient compression for distributed optimization","author":"Vogels Thijs","year":"2019","unstructured":"Thijs Vogels, Sai\u00a0Praneeth Karimireddy, and Martin Jaggi. 2019. PowerSGD: practical low-rank gradient compression for distributed optimization. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.5555\/3691825.3691904"},{"key":"e_1_3_3_2_54_2","first-page":"685","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Warraich Ertza","year":"2025","unstructured":"Ertza Warraich, Omer Shabtai, Khalid Manaa, Shay Vargaftik, Yonatan Piasetzky, Matty Kadosh, Lalith Suresh, and Muhammad Shahbaz. 2025. OptiReduce: Resilient and Tail-Optimal AllReduce for Distributed Deep Learning in the Cloud. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). USENIX Association, Philadelphia, PA, 685\u2013703. https:\/\/www.usenix.org\/conference\/nsdi25\/presentation\/warraich"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1101"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00068"},{"key":"e_1_3_3_2_57_2","unstructured":"Rowan Zellers Yonatan Bisk Roy Schwartz and Yejin Choi. 2018. SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference. arxiv:https:\/\/arXiv.org\/abs\/1808.05326\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1808.05326"},{"key":"e_1_3_3_2_58_2","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Pritam Damania Bernard Nguyen Geeta Chauhan Yuchen Hao Ajit Mathews and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arxiv:https:\/\/arXiv.org\/abs\/2304.11277\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2304.11277"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756092","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:45:32Z","timestamp":1769463932000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756092"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":57,"alternative-id":["10.1145\/3725843.3756092","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756092","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}