{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:56:39Z","timestamp":1768031799704,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3625549.3658678","type":"proceedings-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T15:55:29Z","timestamp":1725033329000},"page":"135-147","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["ADTopk: All-Dimension Top-k Compression for High-Performance Data-Parallel DNN Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1616-8054","authenticated-orcid":false,"given":"Zhangqiang","family":"Ming","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, HuBei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1265-7141","authenticated-orcid":false,"given":"Yuchong","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, HuBei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6391-7798","authenticated-orcid":false,"given":"Wenxiang","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, HuBei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7321-9264","authenticated-orcid":false,"given":"Xinjue","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, HuBei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1143-052X","authenticated-orcid":false,"given":"Chenxuan","family":"Yao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, HuBei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4674-6006","authenticated-orcid":false,"given":"Dan","family":"Feng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, HuBei, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of Machine Learning and Systems. 55--80","author":"Agarwal Saurabh","year":"2021","unstructured":"Saurabh Agarwal, Hongyi Wang, Kangwook Lee, Shivaram Venkataraman, and Dimitris Papailiopoulos. 2021. Adaptive gradient communication via critical learning regime identification. In Proceedings of Machine Learning and Systems. 55--80."},{"key":"e_1_3_2_1_2_1","volume-title":"Sparse communication for distributed gradient descent. arXiv preprint arXiv:1704.05021","author":"Aji Alham Fikri","year":"2017","unstructured":"Alham Fikri Aji and Kenneth Heafield. 2017. Sparse communication for distributed gradient descent. 
arXiv preprint arXiv:1704.05021 (2017)."},{"key":"e_1_3_2_1_3_1","volume-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding. In Advances in Neural Information Processing Systems. 8024--8035.","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-efficient SGD via gradient quantization and encoding. In Advances in Neural Information Processing Systems. 8024--8035."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483553"},{"key":"e_1_3_2_1_6_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. In Advances in Neural Information Processing Systems. 1877--1901."},{"key":"e_1_3_2_1_7_1","unstructured":"Chia-Yu Chen Jiamin Ni Songtao Lu Xiaodong Cui Pin-Yu Chen Xiao Sun Naigang Wang Swagath Venkataramani Vijayalakshmi Srinivasan Wei Zhang et al. 2020. ScaleCom: Scalable Sparsified Gradient Compression for Communication-Efficient Distributed Training. In Advances in Neural Information Processing Systems. 13551--13563."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations.","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Sam Shleifer, and Luke Zettlemoyer. 2022. 8-bit Optimizers via Block-wise Quantization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_11_1","unstructured":"Fartash Faghri Iman Tabrizian Ilia Markov Dan Alistarh Daniel M Roy and Ali Ramezani-Kebrya. 2020. Adaptive gradient quantization for data-parallel sgd. In Advances in Neural Information Processing Systems. 3174--3185."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Jiarui Fang Haohuan Fu Guangwen Yang and Cho-Jui Hsieh. 2019. RedSync: reducing synchronization bandwidth for distributed deep learning training system. J. Parallel and Distrib. Comput. (2019) 30--39.","DOI":"10.1016\/j.jpdc.2019.05.016"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"e_1_3_2_1_14_1","volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Jiang Peng","year":"2018","unstructured":"Peng Jiang and Gagan Agrawal. 2018. 
A Linear Speedup Analysis of Distributed Deep Learning with Sparse and Quantized Communication. In Advances in Neural Information Processing Systems, Vol. 31. Curran Associates, Inc., 2530--2541."},{"key":"e_1_3_2_1_17_1","unstructured":"Alex Krizhevsky Geoffrey Hinton et al. 2009. Learning multiple layers of features from tiny images. Master's thesis, University of Toronto (2009)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508399"},{"key":"e_1_3_2_1_19_1","volume-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887","author":"Lin Yujun","year":"2017","unstructured":"Yujun Lin, Song Han, Huizi Mao, Yu Wang, and William J Dally. 2017. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887 (2017)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267809.3267840"},{"key":"e_1_3_2_1_21_1","volume-title":"International conference on machine learning. PMLR, 14542--14559","author":"Luo Xu","year":"2022","unstructured":"Xu Luo, Jing Xu, and Zenglin Xu. 2022. Channel importance matters in few-shot image classification. In International conference on machine learning. PMLR, 14542--14559."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of Machine Learning and Systems. 297--322","author":"Abdelmoniem Ahmed M","year":"2021","unstructured":"Ahmed M Abdelmoniem, Ahmed Elzanaty, Mohamed-Slim Alouini, and Marco Canini. 2021. An efficient statistical-based gradient compression technique for distributed training systems. In Proceedings of Machine Learning and Systems. 297--322."},{"key":"e_1_3_2_1_23_1","volume-title":"Regularizing and optimizing LSTM language models. arXiv preprint arXiv:1708.02182","author":"Merity Stephen","year":"2017","unstructured":"Stephen Merity, Nitish Shirish Keskar, and Richard Socher. 2017. Regularizing and optimizing LSTM language models. arXiv preprint arXiv:1708.02182 (2017)."},{"key":"e_1_3_2_1_24_1","volume-title":"Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843","author":"Merity Stephen","year":"2016","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843 (2016)."},{"key":"e_1_3_2_1_25_1","unstructured":"NVIDIA. 2023. The Developer Tools Documentation of NVIDIA Nsight Systems. https:\/\/docs.nvidia.com\/nsight-systems. Online accessed on Sept-2023."},{"key":"e_1_3_2_1_26_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. In Advances in Neural Information Processing Systems. 8026--8037.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. In Advances in Neural Information Processing Systems. 8026--8037."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_28_1","volume-title":"Know what you don't know: Unanswerable questions for SQuAD. arXiv preprint arXiv:1806.03822","author":"Rajpurkar Pranav","year":"2018","unstructured":"Pranav Rajpurkar, Robin Jia, and Percy Liang. 2018. Know what you don't know: Unanswerable questions for SQuAD.
arXiv preprint arXiv:1806.03822 (2018)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356222"},{"key":"e_1_3_2_1_30_1","unstructured":"Atal Sahu Aritra Dutta Ahmed M Abdelmoniem Trambak Banerjee Marco Canini and Panos Kalnis. 2021. Rethinking gradient sparsification as total error minimization. In Advances in Neural Information Processing Systems. 8133--8146."},{"key":"e_1_3_2_1_31_1","volume-title":"Scaling distributed machine learning with in-network aggregation. arXiv preprint arXiv:1903.06701","author":"Sapio Amedeo","year":"2019","unstructured":"Amedeo Sapio, Marco Canini, Chen-Yu Ho, Jacob Nelson, Panos Kalnis, Changhoon Kim, Arvind Krishnamurthy, Masoud Moshref, Dan RK Ports, and Peter Richt\u00e1rik. 2019. Scaling distributed machine learning with in-network aggregation. arXiv preprint arXiv:1903.06701 (2019)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Frank Seide Hao Fu Jasha Droppo Gang Li and Dong Yu. 2014. 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech dnns. In Fifteenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2014-274"},{"key":"e_1_3_2_1_33_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799 (2018)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3183735"},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Machine Learning. PMLR, 8645--8654","author":"Shao Wenqi","year":"2020","unstructured":"Wenqi Shao, Shitao Tang, Xingang Pan, Ping Tan, Xiaogang Wang, and Ping Luo. 2020. Channel equilibrium networks for learning deep representation. In International Conference on Machine Learning. PMLR, 8645--8654."},{"key":"e_1_3_2_1_36_1","volume-title":"Understanding top-k sparsification in distributed deep learning. arXiv preprint arXiv:1911.08772","author":"Shi Shaohuai","year":"2019","unstructured":"Shaohuai Shi, Xiaowen Chu, Ka Chun Cheung, and Simon See. 2019. Understanding top-k sparsification in distributed deep learning. arXiv preprint arXiv:1911.08772 (2019)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488803"},{"key":"e_1_3_2_1_38_1","volume-title":"Layer-wise adaptive gradient sparsification for distributed deep learning with convergence guarantees. arXiv preprint arXiv:1911.08727","author":"Shi Shaohuai","year":"2019","unstructured":"Shaohuai Shi, Zhenheng Tang, Qiang Wang, Kaiyong Zhao, and Xiaowen Chu. 2019. Layer-wise adaptive gradient sparsification for distributed deep learning with convergence guarantees. arXiv preprint arXiv:1911.08727 (2019)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"e_1_3_2_1_40_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_41_1","unstructured":"Sebastian U Stich Jean-Baptiste Cordonnier and Martin Jaggi. 2018. Sparsified SGD with memory. In Advances in Neural Information Processing Systems.
4452--4463."},{"key":"e_1_3_2_1_42_1","unstructured":"Hanlin Tang Shaoduo Gan Ce Zhang Tong Zhang and Ji Liu. 2018. Communication compression for decentralized training. In Advances in Neural Information Processing Systems. 7663--7673."},{"key":"e_1_3_2_1_43_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Processing Systems. 6000--6010."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of Machine Learning and Systems. 365--386","author":"Wang Hongyi","year":"2021","unstructured":"Hongyi Wang, Saurabh Agarwal, and Dimitris Papailiopoulos. 2021. Pufferfish: Communication-efficient models at no extra cost. In Proceedings of Machine Learning and Systems. 365--386."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3369583.3392681"},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the Eighteenth European Conference on Computer Systems (EuroSys '23)","author":"Wang Zhuang","unstructured":"Zhuang Wang, Haibin Lin, Yibo Zhu, and T. S. Eugene Ng. 2023. Hi-Speed DNN Training with Espresso: Unleashing the Full Potential of Gradient Compression with Near-Optimal Usage Strategies. In Proceedings of the Eighteenth European Conference on Computer Systems (EuroSys '23). 867--882."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the Sixth Conference on Machine Learning and Systems (MLSys' 23). Proceedings of the Sixth Conference on Machine Learning and Systems (MLSys' 23)","author":"Wang Zhuang","year":"2023","unstructured":"Zhuang Wang, Xinyu Crystal Wu, Zhaozhuo Xu, and TS Eugene Ng. 2023. CUPCAKE: ACOMPRESSION OPTIMIZER FOR SCALABLE COMMUNICATION-EFFICIENT DISTRIBUTED TRAINING. In Proceedings of the Sixth Conference on Machine Learning and Systems (MLSys' 23). Proceedings of the Sixth Conference on Machine Learning and Systems (MLSys' 23)."},{"key":"e_1_3_2_1_48_1","unstructured":"Jianqiao Wangni Jialei Wang Ji Liu and Tong Zhang. 2018. Gradient sparsification for communication-efficient distributed optimization. In Advances in Neural Information Processing Systems. 1306--1316."},{"key":"e_1_3_2_1_49_1","volume-title":"BIRD: A Lightweight and Adaptive Compressor for Communication-Efficient Distributed Learning Using Tensor-wise Bi-Random Sampling. In 2023 IEEE 41st International Conference on Computer Design (ICCD)","author":"Wu Donglei","year":"2023","unstructured":"Donglei Wu, Weihao Yang, Cai Deng, Xiangyu Zou, Shiyi Li, and Wen Xia. 2023. BIRD: A Lightweight and Adaptive Compressor for Communication-Efficient Distributed Learning Using Tensor-wise Bi-Random Sampling. In 2023 IEEE 41st International Conference on Computer Design (ICCD). IEEE, 605--613."},{"key":"e_1_3_2_1_50_1","volume-title":"International Conference on Machine Learning. PMLR, 5325--5333","author":"Wu Jiaxiang","year":"2018","unstructured":"Jiaxiang Wu, Weidong Huang, Junzhou Huang, and Tong Zhang. 2018. Error compensated quantized SGD and its applications to large-scale distributed optimization. In International Conference on Machine Learning. PMLR, 5325--5333."},{"key":"e_1_3_2_1_51_1","volume-title":"2021 IEEE 41st international conference on distributed computing systems (ICDCS). IEEE, 561--572","author":"Xu Hang","year":"2021","unstructured":"Hang Xu, Chen-Yu Ho, Ahmed M Abdelmoniem, Aritra Dutta, El Houcine Bergou, Konstantinos Karatsenidis, Marco Canini, and Panos Kalnis. 2021. 
GRACE: A compressed communication framework for distributed machine learning. In 2021 IEEE 41st international conference on distributed computing systems (ICDCS). IEEE, 561--572."},{"key":"e_1_3_2_1_52_1","volume-title":"Deepreduce: A sparse-tensor communication framework for federated deep learning. In Advances in Neural Information Processing Systems. 21150--21163.","author":"Xu Hang","year":"2021","unstructured":"Hang Xu, Kelly Kostopoulou, Aritra Dutta, Xin Li, Alexandros Ntoulas, and Panos Kalnis. 2021. Deepreduce: A sparse-tensor communication framework for federated deep learning. In Advances in Neural Information Processing Systems. 21150--21163."},{"key":"e_1_3_2_1_53_1","volume-title":"2021 IEEE 18th International Conference on Mobile Ad Hoc and Smart Systems (MASS). IEEE, 136--144","author":"Yan Guangfeng","year":"2021","unstructured":"Guangfeng Yan, Shao-Lun Huang, Tian Lan, and Linqi Song. 2021. DQ-SGD: Dynamic quantization in SGD for communication-efficient distributed learning. In 2021 IEEE 18th International Conference on Mobile Ad Hoc and Smart Systems (MASS). IEEE, 136--144."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605609"},{"key":"e_1_3_2_1_55_1","volume-title":"MiCRO: Near-Zero Cost Gradient Sparsification for Scaling and Accelerating Distributed DNN Training. arXiv preprint arXiv:2310.00967","author":"Yoon Daegun","year":"2023","unstructured":"Daegun Yoon and Sangyoon Oh. 2023. MiCRO: Near-Zero Cost Gradient Sparsification for Scaling and Accelerating Distributed DNN Training. arXiv preprint arXiv:2310.00967 (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"Preserving Near-Optimal Gradient Sparsification Cost for Scalable Distributed Deep Learning. arXiv preprint arXiv:2402.13781","author":"Yoon Daegun","year":"2024","unstructured":"Daegun Yoon and Sangyoon Oh. 2024. Preserving Near-Optimal Gradient Sparsification Cost for Scalable Distributed Deep Learning. arXiv preprint arXiv:2402.13781 (2024)."}],"event":{"name":"HPDC '24: 33rd International Symposium on High-Performance Parallel and Distributed Computing","location":"Pisa Italy","acronym":"HPDC '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGHPC ACM Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658678","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3625549.3658678","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:50:38Z","timestamp":1750287038000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658678"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":56,"alternative-id":["10.1145\/3625549.3658678","10.1145\/3625549"],"URL":"https:\/\/doi.org\/10.1145\/3625549.3658678","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-08-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}