{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T03:56:31Z","timestamp":1775447791793,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T00:00:00Z","timestamp":1740700800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Department of Energy","award":["DE-AC02-06CH11357"],"award-info":[{"award-number":["DE-AC02-06CH11357"]}]},{"DOI":"10.13039\/501100006374","name":"Ant Group","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62032023,T2125013"],"award-info":[{"award-number":["62032023,T2125013"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2312673,2247080,2303064,2326494,2326495"],"award-info":[{"award-number":["2312673,2247080,2303064,2326494,2326495"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,28]]},"DOI":"10.1145\/3710848.3710852","type":"proceedings-article","created":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T06:20:57Z","timestamp":1740723657000},"page":"212-224","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["COMPSO: Optimizing Gradient Compression for Distributed Training with Second-Order Optimizers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9807-7978","authenticated-orcid":false,"given":"Baixi","family":"Sun","sequence":"first","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6002-6681","authenticated-orcid":false,"given":"Weijin","family":"Liu","sequence":"additional","affiliation":[{"name":"Stevens Institute of Technology, Hoboken, NJ, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6547-6902","authenticated-orcid":false,"given":"J. Gregory","family":"Pauloski","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1101-9148","authenticated-orcid":false,"given":"Jiannan","family":"Tian","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5283-5241","authenticated-orcid":false,"given":"Jinda","family":"Jia","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4444-3634","authenticated-orcid":false,"given":"Daoce","family":"Wang","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8595-6238","authenticated-orcid":false,"given":"Boyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3386-3463","authenticated-orcid":false,"given":"Mingkai","family":"Zheng","sequence":"additional","affiliation":[{"name":"Rutgers University, New Brunswick, NJ, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7339-5256","authenticated-orcid":false,"given":"Sheng","family":"Di","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory, Lemont, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9250-0611","authenticated-orcid":false,"given":"Sian","family":"Jin","sequence":"additional","affiliation":[{"name":"Temple University, Philadelphia, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5921-0035","authenticated-orcid":false,"given":"Zhao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Rutgers University, New Brunswick, NJ, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6244-1264","authenticated-orcid":false,"given":"Xiaodong","family":"Yu","sequence":"additional","affiliation":[{"name":"Stevens Institute of Technology, Hoboken, NJ, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7000-4195","authenticated-orcid":false,"given":"Kamil A.","family":"Iskra","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory, Lemont, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9428-7801","authenticated-orcid":false,"given":"Pete","family":"Beckman","sequence":"additional","affiliation":[{"name":"Northwestern University, Argonne National Laboratory, Lemont, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6361-5948","authenticated-orcid":false,"given":"Guangming","family":"Tan","sequence":"additional","affiliation":[{"name":"University of Chinese, Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5422-4497","authenticated-orcid":false,"given":"Dingwen","family":"Tao","sequence":"additional","affiliation":[{"name":"University of Chinese, Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,2,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Second-order stochastic optimization in linear time. stat 1050","author":"Agarwal Naman","year":"2016","unstructured":"Naman Agarwal, Brian Bullins, and Elad Hazan. 2016. Second-order stochastic optimization in linear time. stat 1050 (2016), 15."},{"key":"e_1_3_2_1_2_1","volume-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in neural information processing systems 30","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572751.3572765"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","unstructured":"Sid Black Gao Leo Phil Wang Connor Leahy and Stella Biderman. 2021. GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow. https:\/\/doi.org\/10.5281\/zenodo.5297715 If you use this software please cite it using these metadata..","DOI":"10.5281\/zenodo.5297715"},{"key":"e_1_3_2_1_5_1","volume-title":"Antonio Torralba and Sanja Fidler---Proceedings of the IEEE international conference on computer vision. 19--27","author":"Books Aligning","year":"2015","unstructured":"Aligning Books. 2015. Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books---Yukun Zhu. In Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba and Sanja Fidler---Proceedings of the IEEE international conference on computer vision. 19--27."},{"key":"e_1_3_2_1_6_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arXiv:2005.14165 [cs.CL]"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3195970.3196071"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1098\/rsos.211631"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_11_1","unstructured":"EleutherAI. [n.d.]. EleutherAI\/gpt-neo-125m \u2022 Hugging Face. https:\/\/huggingface.co\/EleutherAI\/gpt-neo-125m."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Hao Feng Boyuan Zhang Fanjiang Ye Min Si Ching-Hsiang Chu Jiannan Tian Chunxing Yin Summer Deng Yuchen Hao Pavan Balaji et al. 2024. Accelerating Communication in Deep Learning Recommendation Model Training with Dual-Level Adaptive Lossy Compression. In SC24: International Conference for High Performance Computing Networking Storage and Analysis. IEEE 1--16.","DOI":"10.1109\/SC41406.2024.00095"},{"key":"e_1_3_2_1_14_1","volume-title":"The Pile: An 800GB Dataset of Diverse Text for Language Modeling. arXiv:2101.00027 [cs.CL]","author":"Gao Leo","year":"2020","unstructured":"Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, Shawn Presser, and Connor Leahy. 2020. The Pile: An 800GB Dataset of Diverse Text for Language Modeling. arXiv:2101.00027 [cs.CL]"},{"key":"e_1_3_2_1_15_1","first-page":"2386","article-title":"Practical quasinewton methods for training deep neural networks","volume":"33","author":"Goldfarb Donald","year":"2020","unstructured":"Donald Goldfarb, Yi Ren, and Achraf Bahamou. 2020. Practical quasinewton methods for training deep neural networks. Advances in Neural Information Processing Systems 33 (2020), 2386--2396.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","first-page":"20889","article-title":"Linearly converging error compensated SGD","volume":"33","author":"Gorbunov Eduard","year":"2020","unstructured":"Eduard Gorbunov, Dmitry Kovalev, Dmitry Makarenko, and Peter Richt\u00e1rik. 2020. Linearly converging error compensated SGD. Advances in Neural Information Processing Systems 33 (2020), 20889--20900.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","volume-title":"International conference on machine learning. PMLR, 5200--5209","author":"Gower Robert Mansel","year":"2019","unstructured":"Robert Mansel Gower, Nicolas Loizou, Xun Qian, Alibek Sailanbayev, Egor Shulgin, and Peter Richt\u00e1rik. 2019. SGD: General analysis and improved rates. In International conference on machine learning. PMLR, 5200--5209."},{"key":"e_1_3_2_1_18_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Gupta Vineet","year":"2018","unstructured":"Vineet Gupta, Tomer Koren, and Yoram Singer. 2018. Shampoo: Preconditioned stochastic tensor optimization. In International Conference on Machine Learning. PMLR, 1842--1850."},{"key":"e_1_3_2_1_19_1","unstructured":"Kaiming He Georgia Gkioxari Piotr Doll\u00e1r and Ross Girshick. 2018. Mask R-CNN. arXiv:1703.06870 [cs.CV]"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","volume-title":"Stochastic distributed learning with gradient quantization and double-variance reduction. Optimization Methods and Software","author":"Horv\u00e1th Samuel","year":"2022","unstructured":"Samuel Horv\u00e1th, Dmitry Kovalev, Konstantin Mishchenko, Peter Richt\u00e1rik, and Sebastian Stich. 2022. Stochastic distributed learning with gradient quantization and double-variance reduction. Optimization Methods and Software (2022), 1--16."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00085"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/JRPROC.1952.273898"},{"key":"e_1_3_2_1_24_1","unstructured":"Jinda Jia Cong Xie Hanlin Lu Daoce Wang Hao Feng Chengming Zhang Baixi Sun Haibin Lin Zhi Zhang Xin Liu et al. 2024. SDP4Bit: Toward 4-bit Communication Quantization in Sharded Data Parallelism for LLM Training. arXiv preprint arXiv:2410.15526 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3230840"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-021-03819-2"},{"key":"e_1_3_2_1_27_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508399"},{"key":"e_1_3_2_1_29_1","volume-title":"SZ3: A Modular Framework for Composing Prediction-Based Error-Bounded Lossy Compressors","author":"Liang Xin","year":"2022","unstructured":"Xin Liang, Kai Zhao, Sheng Di, Sihuan Li, Robert Underwood, Ali M. Gok, Jiannan Tian, Junjing Deng, Jon C. Calhoun, Dingwen Tao, Zizhong Chen, and Franck Cappello. 2022. SZ3: A Modular Framework for Composing Prediction-Based Error-Bounded Lossy Compressors. IEEE Transactions on Big Data (2022), 1--14."},{"key":"e_1_3_2_1_30_1","first-page":"53","article-title":"3lc: Lightweight and effective traffic compression for distributed machine learning","volume":"1","author":"Lim Hyeontaek","year":"2019","unstructured":"Hyeontaek Lim, David G Andersen, and Michael Kaminsky. 2019. 3lc: Lightweight and effective traffic compression for distributed machine learning. Proceedings of Machine Learning and Systems 1 (2019), 53--64.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_31_1","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie Lubomir Bourdev Ross Girshick James Hays Pietro Perona Deva Ramanan C. Lawrence Zitnick and Piotr Doll\u00e1r. 2015. Microsoft COCO: Common Objects in Context. arXiv:1405.0312 [cs.CV]"},{"key":"e_1_3_2_1_32_1","volume-title":"Sophia: A Scalable Stochastic Second-order Optimizer for Language Model Pre-training. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=3xHDeA8Noi","author":"Liu Hong","year":"2024","unstructured":"Hong Liu, Zhiyuan Li, David Leo Wright Hall, Percy Liang, and Tengyu Ma. 2024. Sophia: A Scalable Stochastic Second-order Optimizer for Language Model Pre-training. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=3xHDeA8Noi"},{"key":"e_1_3_2_1_33_1","volume-title":"Quantized Distributed Training of Large Models with Convergence Guarantees. arXiv preprint arXiv:2302.02390","author":"Markov Ilia","year":"2023","unstructured":"Ilia Markov, Adrian Vladu, Qi Guo, and Dan Alistarh. 2023. Quantized Distributed Training of Large Models with Convergence Guarantees. arXiv preprint arXiv:2302.02390 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Learning Representations.","author":"Martens James","year":"2018","unstructured":"James Martens, Jimmy Ba, and Matt Johnson. 2018. Kronecker-factored curvature approximations for recurrent neural networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_35_1","volume-title":"International conference on machine learning. PMLR, 2408--2417","author":"Martens James","year":"2015","unstructured":"James Martens and Roger Grosse. 2015. Optimizing neural networks with kronecker-factored approximate curvature. In International conference on machine learning. PMLR, 2408--2417."},{"key":"e_1_3_2_1_36_1","volume-title":"International conference on machine learning. PMLR, 2408--2417","author":"Martens James","year":"2015","unstructured":"James Martens and Roger Grosse. 2015. Optimizing neural networks with kronecker-factored approximate curvature. In International conference on machine learning. PMLR, 2408--2417."},{"key":"e_1_3_2_1_37_1","unstructured":"Meta. [n. d.]. Data dump torrents - Meta. https:\/\/meta.wikimedia.org\/wiki\/Data_dump_torrents#English_. (Accessed on 04\/07\/2023)."},{"key":"e_1_3_2_1_38_1","first-page":"6829","article-title":"Asynchronous decentralized SGD with quantized and local updates","volume":"34","author":"Nadiradze Giorgi","year":"2021","unstructured":"Giorgi Nadiradze, Amirmojtaba Sabour, Peter Davies, Shigang Li, and Dan Alistarh. 2021. Asynchronous decentralized SGD with quantized and local updates. Advances in Neural Information Processing Systems 34 (2021), 6829--6842.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_39_1","unstructured":"NVIDIA. [n.d.]. NVIDIA\/DeepLearningExamples: State-of-the-Art Deep Learning scripts organized by models - easy to train and deploy with reproducible accuracy and performance on enterprise-grade infrastructure. https:\/\/github.com\/NVIDIA\/DeepLearningExamples."},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA. 2024. NVCOMP | NVIDIA Developer. https:\/\/developer.nvidia.com\/nvcomp. (Accessed on 01\/14\/2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Osawa Kazuki","year":"2023","unstructured":"Kazuki Osawa, Shigang Li, and Torsten Hoefler. 2023. PipeFisher: Efficient Training of Large Language Models Using Pipelining and Fisher Information Matrices. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01264"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3161187"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476152"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433826"},{"key":"e_1_3_2_1_46_1","unstructured":"PyTorch. 2024. TORCH.CUDA. https:\/\/pytorch.org\/docs\/stable\/cuda.html."},{"key":"e_1_3_2_1_47_1","volume-title":"100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250","author":"Rajpurkar Pranav","year":"2016","unstructured":"Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. Squad: 100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250 (2016)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC56025.2022.00024"},{"key":"e_1_3_2_1_49_1","volume-title":"Ka Chun Cheung, and Simon See","author":"Shi Shaohuai","year":"2019","unstructured":"Shaohuai Shi, Xiaowen Chu, Ka Chun Cheung, and Simon See. 2019. Understanding top-k sparsification in distributed deep learning. arXiv preprint arXiv:1911.08772 (2019)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSAIT.2021.3103920"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Nikko Str\u00f6m. 2015. Scalable distributed DNN training using commodity GPU cloud computing. (2015).","DOI":"10.21437\/Interspeech.2015-354"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414624"},{"key":"e_1_3_2_1_53_1","volume-title":"International Conference on Machine Learning. PMLR, 36058--36076","author":"Wang Jue","year":"2023","unstructured":"Jue Wang, Yucheng Lu, Binhang Yuan, Beidi Chen, Percy Liang, Christopher De Sa, Christopher Re, and Ce Zhang. 2023. CocktailSGD: Fine-tuning foundation models over 500Mbps networks. In International Conference on Machine Learning. PMLR, 36058--36076."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337888"},{"key":"e_1_3_2_1_55_1","volume-title":"Terngrad: Ternary gradients to reduce communication in distributed deep learning. Advances in neural information processing systems 30","author":"Wen Wei","year":"2017","unstructured":"Wei Wen, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li. 2017. Terngrad: Ternary gradients to reduce communication in distributed deep learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_56_1","volume-title":"Large batch optimization for deep learning: Training bert in 76 minutes. arXiv preprint arXiv:1904.00962","author":"You Yang","year":"2019","unstructured":"Yang You, Jing Li, Sashank Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho-Jui Hsieh. 2019. Large batch optimization for deep learning: Training bert in 76 minutes. arXiv preprint arXiv:1904.00962 (2019)."},{"key":"e_1_3_2_1_57_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=Syx4wnEtvH","author":"You Yang","year":"2020","unstructured":"Yang You, Jing Li, Sashank Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho-Jui Hsieh. 2020. Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=Syx4wnEtvH"},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"7193","author":"Yu Hao","year":"2019","unstructured":"Hao Yu, Rong Jin, and Sen Yang. 2019. On the Linear Speedup Analysis of Communication Efficient Momentum SGD for Distributed Non-Convex Optimization. In Proceedings of the 36th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 97), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, 7184--7193. https:\/\/proceedings.mlr.press\/v97\/yu19d.html"},{"key":"e_1_3_2_1_59_1","unstructured":"Jiaqi Zhang Keyou You and Lihua Xie. 2021. Innovation Compression for Communication-efficient Distributed Optimization with Linear Convergence. arXiv:2105.06697 [math.OC]"},{"key":"e_1_3_2_1_60_1","volume-title":"Accelerating Broadcast Communication with GPU Compression for Deep Learning Workloads. In 2022 IEEE 29th International Conference on High Performance Computing, Data, and Analytics (HiPC)","author":"Zhou Qinghua","unstructured":"Qinghua Zhou, Quentin Anthony, Aamir Shafi, Hari Subramoni, and Dhabaleswar K DK Panda. 2022. Accelerating Broadcast Communication with GPU Compression for Deep Learning Workloads. In 2022 IEEE 29th International Conference on High Performance Computing, Data, and Analytics (HiPC). IEEE, IEEE, 22--31."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00023"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00053"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-07312-0_1"}],"event":{"name":"PPoPP '25: The 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Las Vegas NV USA","acronym":"PPoPP '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710852","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3710848.3710852","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3710848.3710852","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:13:43Z","timestamp":1755875623000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710852"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,28]]},"references-count":63,"alternative-id":["10.1145\/3710848.3710852","10.1145\/3710848"],"URL":"https:\/\/doi.org\/10.1145\/3710848.3710852","relation":{},"subject":[],"published":{"date-parts":[[2025,2,28]]},"assertion":[{"value":"2025-02-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}