{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T16:49:58Z","timestamp":1755794998741,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"DOI":"10.13039\/100017052","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62132022, U24A20245"],"award-info":[{"award-number":["62132022, U24A20245"]}],"id":[{"id":"10.13039\/100017052","id-type":"DOI","asserted-by":"publisher"}]},{"name":"The Science and Technology Innovation Program of Hunan Province","award":["2024RC1005"],"award-info":[{"award-number":["2024RC1005"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737142","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:03:27Z","timestamp":1754255007000},"page":"1612-1622","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SwitchTop-<i>k<\/i>: Scaling Top-<i>k<\/i> Compression on Programmable Switches"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4335-8742","authenticated-orcid":false,"given":"Yijun","family":"Li","sequence":"first","affiliation":[{"name":"Central South University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7578-4490","authenticated-orcid":false,"given":"Jiawei","family":"Huang","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8743-0270","authenticated-orcid":false,"given":"Jingling","family":"Liu","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9677-2368","authenticated-orcid":false,"given":"Zhaoyi","family":"Li","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5067-321X","authenticated-orcid":false,"given":"Wanchun","family":"Jiang","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1516-0480","authenticated-orcid":false,"given":"Jianxin","family":"Wang","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2022. Intel Data Plane Development Kit (DPDK). https:\/\/www.dpdk.org\/."},{"key":"e_1_3_2_2_2_1","unstructured":"2023. Horovod. https:\/\/horovod.ai\/."},{"key":"e_1_3_2_2_3_1","unstructured":"2023. NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_2_4_1","unstructured":"2025. ns-3 | a discrete-event network simulator for internet systems. https:\/\/www.nsnam.org\/."},{"key":"e_1_3_2_2_5_1","first-page":"265","article-title":"Tensorflow: A system for large-scale machine learning","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, and Michael Isard. 2016. Tensorflow: A system for large-scale machine learning. In Proc. USENIX OSDI. 265-283.","journal-title":"Proc. USENIX OSDI."},{"key":"e_1_3_2_2_6_1","first-page":"652","article-title":"On the utility of gradient compression in distributed training systems","volume":"4","author":"Agarwal Saurabh","year":"2022","unstructured":"Saurabh Agarwal, Hongyi Wang, Shivaram Venkataraman, and Dimitris Papailiopoulos. 2022. On the utility of gradient compression in distributed training systems. Proc. 
MLSys 4, 652-672.","journal-title":"Proc. MLSys"},{"key":"e_1_3_2_2_7_1","first-page":"440","article-title":"Sparse Communication for Distributed Gradient Descent","author":"Aji Alham Fikri","year":"2017","unstructured":"Alham Fikri Aji and Kenneth Heafield. 2017. Sparse Communication for Distributed Gradient Descent. In Proc. EMNLP. 440-445.","journal-title":"Proc. EMNLP."},{"key":"e_1_3_2_2_8_1","first-page":"1709","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","volume":"30","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in Neural Information Processing Systems 30 (2017), 1709-1720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2486001.2486031"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483553"},{"key":"e_1_3_2_2_11_1","volume-title":"Randomized admission policy for efficient top-k and frequency estimation. arXiv preprint arXiv:1612.02962","author":"Basat Ran Ben","year":"2016","unstructured":"Ran Ben Basat, Gil Einziger, Roy Friedman, and Yaron Kassner. 2016. Randomized admission policy for efficient top-k and frequency estimation. arXiv preprint arXiv:1612.02962 (2016)."},{"key":"e_1_3_2_2_12_1","first-page":"560","article-title":"signSGD: Compressed optimisation for non-convex problems","author":"Bernstein Jeremy","year":"2018","unstructured":"Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli, and Animashree Anandkumar. 2018. signSGD: Compressed optimisation for non-convex problems. In Proc. ICML. 560-569.","journal-title":"Proc. 
ICML."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1137\/16M1080173"},{"key":"e_1_3_2_2_14_1","first-page":"13551","article-title":"Scalecom: Scalable sparsified gradient compression for communication-efficient distributed training","volume":"33","author":"Chen Chia-Yu","year":"2020","unstructured":"Chia-Yu Chen, Jiamin Ni, Songtao Lu, Xiaodong Cui, Pin-Yu Chen, Xiao Sun, Naigang Wang, Swagath Venkataramani, Vijayalakshmi Viji Srinivasan, Wei Zhang, et al. 2020. Scalecom: Scalable sparsified gradient compression for communication-efficient distributed training. Advances in Neural Information Processing Systems 33 (2020), 13551-13563.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_15_1","first-page":"571","article-title":"Project adam: Building an efficient and scalable deep learning training system","author":"Chilimbi Trishul","year":"2014","unstructured":"Trishul Chilimbi, Yutaka Suzue, Johnson Apacible, and Karthik Kalyanaraman. 2014. Project adam: Building an efficient and scalable deep learning training system. In Proc. USENIX OSDI. 571-582.","journal-title":"Proc. USENIX OSDI."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/1953048.2078186"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jalgor.2003.12.001"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3208425"},{"key":"e_1_3_2_2_19_1","first-page":"1223","article-title":"Large Scale Distributed Deep Networks","volume":"25","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean, Gregory S. Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Quoc V. Le, Mark Z. Mao, Marc'Aurelio Ranzato, Andrew W. Senior, Paul A. Tucker, Ke Yang, and A. Ng. 2012. Large Scale Distributed Deep Networks. 
Advances in Neural Information Processing Systems 25 (2012), 1223-1231.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_20_1","first-page":"676","article-title":"Efficient sparse collective communication and its application to accelerate distributed deep learning","author":"Fei Jiawei","year":"2021","unstructured":"Jiawei Fei, Chen-Yu Ho, Atal N Sahu, Marco Canini, and Amedeo Sapio. 2021. Efficient sparse collective communication and its application to accelerate distributed deep learning. In Proc. ACM SIGCOMM. 676-691.","journal-title":"Proc. ACM SIGCOMM."},{"key":"e_1_3_2_2_21_1","first-page":"829","article-title":"In-network aggregation for shared machine learning clusters","author":"Gebara Nadeen","year":"2021","unstructured":"Nadeen Gebara, Manya Ghobadi, and Paolo Costa. 2021. In-network aggregation for shared machine learning clusters. In Proc. MLSys. 829-844.","journal-title":"Proc. MLSys."},{"key":"e_1_3_2_2_22_1","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep residual learning for image recognition. In Proc. IEEE CVPR. 770-778.","journal-title":"Proc. IEEE CVPR."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2022.3199506"},{"key":"e_1_3_2_2_24_1","unstructured":"Nikita Ivkin Daniel Rothchild Enayat Ullah Ion Stoica Raman Arora et al. 2019. Communication-efficient distributed SGD with sketching. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_2_25_1","first-page":"1269","article-title":"Sketchml: Accelerating distributed machine learning with data sketches","author":"Jiang Jiawei","year":"2018","unstructured":"Jiawei Jiang, Fangcheng Fu, Tong Yang, and Bin Cui. 2018. Sketchml: Accelerating distributed machine learning with data sketches. In Proc. ACM SIGMOD. 1269-1284.","journal-title":"Proc. 
ACM SIGMOD."},{"key":"e_1_3_2_2_26_1","volume-title":"A linear speedup analysis of distributed deep learning with sparse and quantized communication. Advances in Neural Information Processing Systems 31","author":"Jiang Peng","year":"2018","unstructured":"Peng Jiang and Gagan Agrawal. 2018. A linear speedup analysis of distributed deep learning with sparse and quantized communication. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_2_27_1","first-page":"463","article-title":"A Unified Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU\/CPU Clusters","author":"Jiang Yimin","year":"2020","unstructured":"Yimin Jiang, Yibo Zhu, Chang Lan, Bairen Yi, Yong Cui, and Chuanxiong Guo. 2020. A Unified Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU\/CPU Clusters. In Proc. USENIX OSDI. 463-479.","journal-title":"Proc. USENIX OSDI."},{"key":"e_1_3_2_2_28_1","unstructured":"Alex Krizhevsky and Geoffrey Hinton. 2009. Learning multiple layers of features from tiny images. (2009)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_2_30_1","first-page":"741","article-title":"ATP","author":"Lao ChonLam","year":"2021","unstructured":"ChonLam Lao, Yanfang Le, Kshiteej Mahajan, Yixi Chen, Wenfei Wu, Aditya Akella, and Michael Swift. 2021. ATP: In-network Aggregation for Multi-tenant Learning. In Proc. USENIX NSDI. 741-761.","journal-title":"In-network Aggregation for Multi-tenant Learning. In Proc. USENIX NSDI."},{"key":"e_1_3_2_2_31_1","volume-title":"Deep learning. Nature 521, 7553","author":"LeCun Yann","year":"2015","unstructured":"Yann LeCun, Yoshua Bengio, and Geoffrey Hinton. 2015. Deep learning. 
Nature 521, 7553 (2015), 436-444."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_2_33_1","first-page":"19","article-title":"Communication efficient distributed machine learning with the parameter server","volume":"27","author":"Li Mu","year":"2014","unstructured":"Mu Li, David G Andersen, Alexander J Smola, and Kai Yu. 2014. Communication efficient distributed machine learning with the parameter server. Advances in Neural Information Processing Systems 27 (2014), 19-27.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_34_1","volume-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887","author":"Lin Yujun","year":"2017","unstructured":"Yujun Lin, Song Han, Huizi Mao, Yu Wang, and William J Dally. 2017. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887 (2017)."},{"key":"e_1_3_2_2_35_1","volume-title":"Topkapi: parallel and fast sketches for finding top-k frequent elements. Advances in Neural Information Processing Systems 31","author":"Mandal Ankush","year":"2018","unstructured":"Ankush Mandal, He Jiang, Anshumali Shrivastava, and Vivek Sarkar. 2018. Topkapi: parallel and fast sketches for finding top-k frequent elements. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_2_36_1","volume-title":"Nitish Shirish Keskar, and Richard Socher","author":"Merity Stephen","year":"2017","unstructured":"Stephen Merity, Nitish Shirish Keskar, and Richard Socher. 2017. Regularizing and Optimizing LSTM Language Models. 
ArXiv abs\/1708.02182 (2017)."},{"key":"e_1_3_2_2_37_1","first-page":"1","article-title":"PipeDream: generalized pipeline parallelism for DNN training","author":"Narayanan Deepak","year":"2019","unstructured":"Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, Nikhil R Devanur, Gregory R Ganger, Phillip B Gibbons, and Matei Zaharia. 2019. PipeDream: generalized pipeline parallelism for DNN training. In Proc. USENIX OSDI. 1-15.","journal-title":"Proc. USENIX OSDI."},{"key":"e_1_3_2_2_38_1","first-page":"8026","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. In Proc. NIPS. 8026-8037.","journal-title":"Proc. NIPS."},{"key":"e_1_3_2_2_39_1","first-page":"16","article-title":"A generic communication scheduler for distributed dnn training acceleration","author":"Peng Yanghua","year":"2019","unstructured":"Yanghua Peng, Yibo Zhu, Yangrui Chen, Yixin Bao, Bairen Yi, Chang Lan, Chuan Wu, and Chuanxiong Guo. 2019. A generic communication scheduler for distributed dnn training acceleration. In Proc. ACM SOSP. 16-29.","journal-title":"Proc. ACM SOSP."},{"key":"e_1_3_2_2_40_1","volume-title":"Know what you don't know: Unanswerable questions for SQuAD. arXiv preprint arXiv:1806.03822","author":"Rajpurkar Pranav","year":"2018","unstructured":"Pranav Rajpurkar, Robin Jia, and Percy Liang. 2018. Know what you don't know: Unanswerable questions for SQuAD. 
arXiv preprint arXiv:1806.03822 (2018)."},{"key":"e_1_3_2_2_41_1","first-page":"785","article-title":"Scaling Distributed Machine Learning with In-Network Aggregation","author":"Sapio Amedeo","year":"2021","unstructured":"Amedeo Sapio, Marco Canini, Chen-Yu Ho, Jacob Nelson, Panos Kalnis, Changhoon Kim, Arvind Krishnamurthy, Masoud Moshref, Dan RK Ports, and Peter Richtarik. 2021. Scaling Distributed Machine Learning with In-Network Aggregation. In Proc. USENIX NSDI. 785-808.","journal-title":"Proc. USENIX NSDI."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-274"},{"key":"e_1_3_2_2_43_1","first-page":"2238","article-title":"A distributed synchronous SGD algorithm with global top-k sparsification for low bandwidth networks","author":"Shi Shaohuai","year":"2019","unstructured":"Shaohuai Shi, Qiang Wang, Kaiyong Zhao, Zhenheng Tang, Yuxin Wang, Xiang Huang, and Xiaowen Chu. 2019. A distributed synchronous SGD algorithm with global top-k sparsification for low bandwidth networks. In Proc. IEEE ICDCS. 2238-2247.","journal-title":"Proc. IEEE ICDCS."},{"key":"e_1_3_2_2_44_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_2_45_1","volume-title":"Sparsified SGD with memory. Advances in Neural Information Processing Systems 31","author":"Stich Sebastian U","year":"2018","unstructured":"Sebastian U Stich, Jean-Baptiste Cordonnier, and Martin Jaggi. 2018. Sparsified SGD with memory. 
Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737499"},{"key":"e_1_3_2_2_47_1","volume-title":"Gradient sparsification for communication-efficient distributed optimization. Advances in Neural Information Processing Systems 31","author":"Wangni Jianqiao","year":"2018","unstructured":"Jianqiao Wangni, Jialei Wang, Ji Liu, and Tong Zhang. 2018. Gradient sparsification for communication-efficient distributed optimization. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_2_48_1","volume-title":"Terngrad: Ternary gradients to reduce communication in distributed deep learning. Advances in neural information processing systems 30","author":"Xu Cong","year":"2017","unstructured":"Wei Wen, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li. 2017. Terngrad: Ternary gradients to reduce communication in distributed deep learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_49_1","volume-title":"Konstantinos Karatsenidis, Marco Canini, and Panos Kalnis.","author":"Xu Hang","year":"2021","unstructured":"Hang Xu, Chen-Yu Ho, Ahmed M Abdelmoniem, Aritra Dutta, El Houcine Bergou, Konstantinos Karatsenidis, Marco Canini, and Panos Kalnis. 2021. Grace: A compressed communication framework for distributed machine learning. In Proc. IEEE ICDCS. 561-572."},{"key":"e_1_3_2_2_50_1","first-page":"561","article-title":"Elastic sketch: Adaptive and fast networkwide measurements","author":"Yang Tong","year":"2018","unstructured":"Tong Yang, Jie Jiang, Peng Liu, Qun Huang, Junzhi Gong, Yang Zhou, Rui Miao, Xiaoming Li, and Steve Uhlig. 2018. Elastic sketch: Adaptive and fast networkwide measurements. In Proc. ACM SIGCOMM. 561-575.","journal-title":"Proc. 
ACM SIGCOMM."},{"key":"e_1_3_2_2_51_1","first-page":"7184","article-title":"On the linear speedup analysis of communication efficient momentum SGD for distributed non-convex optimization","author":"Yu Hao","year":"2019","unstructured":"Hao Yu, Rong Jin, and Sen Yang. 2019. On the linear speedup analysis of communication efficient momentum SGD for distributed non-convex optimization. In Proc. ICML. 7184-7193.","journal-title":"Proc. ICML."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2018.2877700"},{"key":"e_1_3_2_2_53_1","first-page":"428","article-title":"Multi-resource interleaving for deep learning training","author":"Zhao Yihao","year":"2022","unstructured":"Yihao Zhao, Yuanqiang Liu, Yanghua Peng, Yibo Zhu, Xuanzhe Liu, and Xin Jin. 2022. Multi-resource interleaving for deep learning training. In Proc. ACM SIGCOMM. 428-440.","journal-title":"Proc. ACM SIGCOMM."}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Toronto ON Canada","acronym":"KDD '25"},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining 
V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737142","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T14:37:53Z","timestamp":1755355073000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737142"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":53,"alternative-id":["10.1145\/3711896.3737142","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737142","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}