{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T00:08:43Z","timestamp":1755907723225,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T00:00:00Z","timestamp":1732060800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China under Grant","award":["62221003"],"award-info":[{"award-number":["62221003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,20]]},"DOI":"10.1145\/3698038.3698541","type":"proceedings-article","created":{"date-parts":[[2024,11,14]],"date-time":"2024-11-14T06:32:43Z","timestamp":1731565963000},"page":"977-994","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Near-Lossless Gradient Compression for Data-Parallel Distributed DNN Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5713-7225","authenticated-orcid":false,"given":"Xue","family":"Li","sequence":"first","affiliation":[{"name":"Alibaba Group"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5663-0052","authenticated-orcid":false,"given":"Cheng","family":"Guo","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9882-9279","authenticated-orcid":false,"given":"Kun","family":"Qian","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5274-5512","authenticated-orcid":false,"given":"Menghao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Unaffiliated"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0683-0713","authenticated-orcid":false,"given":"Mengyu","family":"Yang","sequence":"additional","affiliation":[{"name":"Unaffiliated"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4847-4585","authenticated-orcid":false,"given":"Mingwei","family":"Xu","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]}],"member":"320","published-online":{"date-parts":[[2024,11,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Mark Adler. 2024. A Massively Spiffy Yet Delicately Unobtrusive Compression Library. https:\/\/www.zlib.net\/."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626717"},{"key":"e_1_3_2_1_3_1","volume-title":"Sparse communication for distributed gradient descent. arXiv preprint arXiv:1704.05021","author":"Aji Alham Fikri","year":"2017","unstructured":"Alham Fikri Aji and Kenneth Heafield. 2017. Sparse communication for distributed gradient descent. arXiv preprint arXiv:1704.05021 (2017)."},{"key":"e_1_3_2_1_4_1","volume-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in neural information processing systems 30","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_5_1","unstructured":"Amazon Web Services. 2024. Amazon Web Services. 
https:\/\/aws.amazon.com\/."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483553"},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Machine Learning. PMLR, 560--569","author":"Bernstein Jeremy","year":"2018","unstructured":"Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli, and Animashree Anandkumar. 2018. signSGD: Compressed optimisation for non-convex problems. In International Conference on Machine Learning. PMLR, 560--569."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_9_1","volume-title":"Alibaba Cloud: Reliable and Secure Cloud Computing Services. https:\/\/www.alibabacloud.com\/.","author":"Cloud Alibaba","year":"2024","unstructured":"Alibaba Cloud. 2024. Alibaba Cloud: Reliable and Secure Cloud Computing Services. https:\/\/www.alibabacloud.com\/."},{"key":"e_1_3_2_1_10_1","unstructured":"Alibaba Cloud. 2024. Platform for AI. https:\/\/www.alibabacloud.com\/en\/product\/machine-learning."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575724"},{"key":"e_1_3_2_1_12_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3091475"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/MLHPC.2016.004"},{"key":"e_1_3_2_1_15_1","unstructured":"Facebook. 2024. ZSTD. https:\/\/github.com\/facebook\/zstd."},{"key":"e_1_3_2_1_16_1","unstructured":"Facebook AI Research. 2019. Gloo: a collective communications library. https:\/\/github.com\/facebookincubator\/gloo."},{"volume-title":"Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming. 431--445","author":"Shiqing","key":"e_1_3_2_1_17_1","unstructured":"Shiqing Fan et al. 2021. DAPPLE: A pipelined data parallel approach for training large models. In Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming. 431--445."},{"key":"e_1_3_2_1_18_1","unstructured":"FSF & GNU. 2024. GNU Gzip. https:\/\/www.gnu.org\/software\/gzip\/."},{"key":"e_1_3_2_1_19_1","unstructured":"Google. 2024. Google Cloud Platform. https:\/\/cloud.google.com\/."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-50743-5_3"},{"key":"e_1_3_2_1_21_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_23_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. 
Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/JRPROC.1952.273898"},{"key":"e_1_3_2_1_25_1","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. 2021. Elastic resource sharing for distributed deep learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). 721--739."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/IEEESTD.2019.8766229"},{"volume-title":"BFLOAT16 - Hardware Numerics Definition. https:\/\/www.intel.com\/content\/dam\/develop\/external\/us\/en\/d ocuments\/bf16-hardware-numerics-definition-white-paper.pdf [Online","year":"2024","key":"e_1_3_2_1_27_1","unstructured":"Intel. 2018. BFLOAT16 - Hardware Numerics Definition. https:\/\/www.intel.com\/content\/dam\/develop\/external\/us\/en\/d ocuments\/bf16-hardware-numerics-definition-white-paper.pdf [Online; accessed 5-October-2024]."},{"volume-title":"Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems. 402--416","author":"Abhinav","key":"e_1_3_2_1_28_1","unstructured":"Abhinav Jangda et al. 2022. Breaking the computation and communication abstraction barrier in distributed machine learning workloads. In Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems. 402--416."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Ziheng","key":"e_1_3_2_1_30_1","unstructured":"Ziheng Jiang et al. 2024. MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 745--760."},{"key":"e_1_3_2_1_31_1","unstructured":"jseward. 2024. bzip2 and libbzip2. https:\/\/sourceware.org\/bzip2\/."},{"key":"e_1_3_2_1_32_1","volume-title":"Amazon sagemaker model parallelism: A general and flexible framework for large model training. arXiv preprint arXiv:2111.05972","author":"Karakus Can","year":"2021","unstructured":"Can Karakus, Rahul Huilgol, Fei Wu, Anirudh Subramanian, Cade Daniel, Derya Cavdar, Teng Xu, Haohan Chen, Arash Rahnama, and Luis Quintela. 2021. Amazon sagemaker model parallelism: A general and flexible framework for large model training. arXiv preprint arXiv:2111.05972 (2021)."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Kim Taebum","year":"2023","unstructured":"Taebum Kim, Hyoungjoo Kim, Gyeong-In Yu, and Byung-Gon Chun. 2023. BPIPE: memory-balanced pipeline parallelism for training large language models. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML'23). JMLR.org, Article 682, 15 pages."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1977.1055714"},{"key":"e_1_3_2_1_35_1","volume-title":"THC: Accelerating Distributed Deep Learning Using Tensor Homomorphic Compression. 
In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Li Minghao","year":"2024","unstructured":"Minghao Li, Ran Ben Basat, Shay Vargaftik, ChonLam Lao, Kevin Xu, Michael Mitzenmacher, and Minlan Yu. 2024. THC: Accelerating Distributed Deep Learning Using Tensor Homomorphic Compression. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 1191--1211. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/li-minghao"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.14778\/3587136.3587149"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508399"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_39_1","volume-title":"Yan Zhuang, Fei Feng, Lingbo Tang, Zheng Cao, Ming Zhang, Frank Kelly, Mohammad Alizadeh, et al.","author":"Li Yuliang","year":"2019","unstructured":"Yuliang Li, Rui Miao, Hongqiang Harry Liu, Yan Zhuang, Fei Feng, Lingbo Tang, Zheng Cao, Ming Zhang, Frank Kelly, Mohammad Alizadeh, et al. 2019. HPCC: High precision congestion control. In Proceedings of the ACM Special Interest Group on Data Communication. 44--58."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551852"},{"key":"e_1_3_2_1_41_1","volume-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887","author":"Lin Yujun","year":"2017","unstructured":"Yujun Lin, Song Han, Huizi Mao, Yu Wang, and William J Dally. 2017. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887 (2017)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2006.143"},{"key":"e_1_3_2_1_43_1","unstructured":"Microsoft Azure. 2024. Microsoft Azure. https:\/\/azure.microsoft.com\/."},{"volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles. 1--15","author":"Deepak","key":"e_1_3_2_1_44_1","unstructured":"Deepak Narayanan et al. 2019. PipeDream: Generalized pipeline parallelism for DNN training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles. 1--15."},{"volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1--15","author":"Deepak","key":"e_1_3_2_1_45_1","unstructured":"Deepak Narayanan et al. 2021. Efficient large-scale language model training on gpu clusters using megatron-lm. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1--15."},{"key":"e_1_3_2_1_46_1","volume-title":"International Conference on Machine Learning. PMLR, 7937--7947","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. 2021. Memory-efficient pipeline-parallel dnn training. In International Conference on Machine Learning. PMLR, 7937--7947."},{"key":"e_1_3_2_1_47_1","volume-title":"HPC up to 20x. https:\/\/blogs.nvidia.com\/blog\/tensorfloat-32-precision-format\/ [Online","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. 2020. TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x. https:\/\/blogs.nvidia.com\/blog\/tensorfloat-32-precision-format\/ [Online; accessed 27-April-2024]."},{"key":"e_1_3_2_1_48_1","unstructured":"NVIDIA Corporation. 2019. 
NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_1_49_1","unstructured":"Open MPI Project. 2019. Open MPI: A High Performance Message Passing Library. https:\/\/www.open-mpi.org\/."},{"key":"e_1_3_2_1_50_1","unstructured":"OpenAI. 2020. Language Models are Few-Shot Learners. https:\/\/openai.com\/blog\/gpt-3-apps."},{"key":"e_1_3_2_1_51_1","volume-title":"Pytorch: An imperative style, highperformance deep learning library. Advances in neural information processing systems 32","author":"Adam Paszke","year":"2019","unstructured":"Adam Paszke et al. 2019. Pytorch: An imperative style, highperformance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_52_1","unstructured":"Python. 2024. lzma --- Compression using the LZMA algorithm. https:\/\/docs.python.org\/3\/library\/lzma.html."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Frank Seide Hao Fu Jasha Droppo Gang Li and Dong Yu. 2014. 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech dnns. In Fifteenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2014-274"},{"key":"e_1_3_2_1_56_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Shah Aashaka","year":"2023","unstructured":"Aashaka Shah, Vijay Chidambaram, Meghan Cowan, Saeed Maleki, Madan Musuvathi, Todd Mytkowicz, Jacob Nelson, Olli Saarikivi, and Rachee Singh. 2023. TACCL: Guiding Collective Algorithm Synthesis using Communication Sketches. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 593--612."},{"key":"e_1_3_2_1_57_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1170"},{"key":"e_1_3_2_1_59_1","volume-title":"Sparsified SGD with memory. Advances in neural information processing systems 31","author":"Stich Sebastian U","year":"2018","unstructured":"Sebastian U Stich, Jean-Baptiste Cordonnier, and Martin Jaggi. 2018. Sparsified SGD with memory. Advances in neural information processing systems 31 (2018)."},{"key":"e_1_3_2_1_60_1","volume-title":"International Conference on Machine Learning. PMLR, 10118--10129","author":"Tang Hanlin","year":"2021","unstructured":"Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, and Yuxiong He. 2021. 1-bit adam: Communication efficient large-scale training with adam's convergence speed. In International Conference on Machine Learning. PMLR, 10118--10129."},{"key":"e_1_3_2_1_61_1","volume-title":"Llama: Open and efficient foundation language models. 
arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_62_1","volume-title":"Sai Praneeth Karimireddy, and Martin Jaggi","author":"Vogels Thijs","year":"2019","unstructured":"Thijs Vogels, Sai Praneeth Karimireddy, and Martin Jaggi. 2019. PowerSGD: Practical low-rank gradient compression for distributed optimization. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_63_1","volume-title":"Atomo: Communication-efficient learning via atomic sparsification. Advances in neural information processing systems 31","author":"Wang Hongyi","year":"2018","unstructured":"Hongyi Wang, Scott Sievert, Shengchao Liu, Zachary Charles, Dimitris Papailiopoulos, and Stephen Wright. 2018. Atomo: Communication-efficient learning via atomic sparsification. Advances in neural information processing systems 31 (2018)."},{"key":"e_1_3_2_1_64_1","volume-title":"Bytecomp: Revisiting gradient compression in distributed training. arXiv preprint arXiv:2205.14465","author":"Wang Zhuang","year":"2022","unstructured":"Zhuang Wang, Haibin Lin, Yibo Zhu, and TS Ng. 2022. Bytecomp: Revisiting gradient compression in distributed training. arXiv preprint arXiv:2205.14465 (2022)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567505"},{"key":"e_1_3_2_1_66_1","volume-title":"Gradient sparsification for communication-efficient distributed optimization. Advances in Neural Information Processing Systems 31","author":"Wangni Jianqiao","year":"2018","unstructured":"Jianqiao Wangni, Jialei Wang, Ji Liu, and Tong Zhang. 2018. Gradient sparsification for communication-efficient distributed optimization. 
Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472927"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544262"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS57875.2023.00031"}],"event":{"name":"SoCC '24: ACM Symposium on Cloud Computing","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGOPS ACM Special Interest Group on Operating Systems"],"location":"Redmond WA USA","acronym":"SoCC '24"},"container-title":["Proceedings of the ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3698038.3698541","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3698038.3698541","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:02:10Z","timestamp":1755889330000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3698038.3698541"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,20]]},"references-count":69,"alternative-id":["10.1145\/3698038.3698541","10.1145\/3698038"],"URL":"https:\/\/doi.org\/10.1145\/3698038.3698541","relation":{},"subject":[],"published":{"date-parts":[[2024,11,20]]},"assertion":[{"value":"2024-11-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
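For readers who want to work with this record programmatically rather than by eye, the following is a minimal sketch of re-fetching the same metadata from the public Crossref REST API (https://api.crossref.org/works/{DOI}) and flattening it into a citation line. It assumes only the third-party requests library; the mailto address is a placeholder illustrating Crossref's optional "polite pool" convention, not part of the record above.

```python
import requests

# Re-fetch the work record shown above from the public Crossref REST API.
# The DOI comes from the record itself; the mailto parameter follows the
# optional "polite pool" convention (the address here is a placeholder).
DOI = "10.1145/3698038.3698541"
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.com"},
    timeout=30,
)
resp.raise_for_status()
msg = resp.json()["message"]  # same shape as the "message" object above

# Assemble a citation line from the fields present in the record.
authors = ", ".join(
    f"{a.get('given', '')} {a.get('family', '')}".strip()
    for a in msg.get("author", [])
)
title = msg["title"][0]
venue = msg["container-title"][0]
year = msg["issued"]["date-parts"][0][0]
pages = msg.get("page", "n/a")

print(f"{authors}. {year}. {title}. In {venue}, {pages}.")
print("DOI:", msg["DOI"], "| references deposited:", msg["references-count"])
```

Run against this DOI, the script prints the six authors, the 2024 SoCC venue, and pages 977-994, matching the fields deposited above; any other Crossref work record with the same "message" shape can be substituted via the DOI constant.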