{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:34:10Z","timestamp":1772724850321,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T00:00:00Z","timestamp":1723420800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Natural Science Foundation of China Grant","award":["U22A6001"],"award-info":[{"award-number":["U22A6001"]}]},{"name":"Shanghai Artificial Intelligence Laboratory","award":["P22KN00581"],"award-info":[{"award-number":["P22KN00581"]}]},{"name":"Key Research Project of Zhejiang Lab","award":["2022PG0AC02"],"award-info":[{"award-number":["2022PG0AC02"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,12]]},"DOI":"10.1145\/3673038.3673095","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T18:29:01Z","timestamp":1723141741000},"page":"514-523","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Holmes: Towards Distributed Training Across Clusters with Heterogeneous NIC Environment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4802-3191","authenticated-orcid":false,"given":"Fei","family":"Yang","sequence":"first","affiliation":[{"name":"Zhejiang Lab, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9599-4276","authenticated-orcid":false,"given":"Shuang","family":"Peng","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7859-4454","authenticated-orcid":false,"given":"Ning","family":"Sun","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3748-5606","authenticated-orcid":false,"given":"Fangyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3205-1442","authenticated-orcid":false,"given":"Yuanyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9613-5908","authenticated-orcid":false,"given":"Fu","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9514-0708","authenticated-orcid":false,"given":"Jiezhong","family":"Qiu","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5996-9110","authenticated-orcid":false,"given":"Aimin","family":"Pan","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Alibaba. 2023. Megatron-LLaMA. https:\/\/github.com\/alibaba\/Megatron-LLaMA."},{"key":"e_1_3_2_1_2_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems 33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, 2020. Language models are few-shot learners. 
Advances in neural information processing systems 33 (2020), 1877\u20131901."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901349"},{"key":"e_1_3_2_1_4_1","unstructured":"FairScale authors. 2021. FairScale: A general purpose modular PyTorch library for high performance and large scale training. https:\/\/github.com\/facebookresearch\/fairscale."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.37"},{"key":"e_1_3_2_1_6_1","volume-title":"When Cloud Storage Meets RDMA. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Gao Yixiao","year":"2021","unstructured":"Yixiao Gao, Qiang Li, Lingbo Tang, Yongqing Xi, Pengcheng Zhang, Wenwen Peng, Bo Li, Yaohui Wu, Shaozong Liu, Lei Yan, 2021. When Cloud Storage Meets RDMA. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). 519\u2013533."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3229543.3229544"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"e_1_3_2_1_9_1","volume-title":"More effective distributed ml via a stale synchronous parallel parameter server. Advances in neural information processing systems 26","author":"Ho Qirong","year":"2013","unstructured":"Qirong Ho, James Cipar, Henggang Cui, Seunghak Lee, Jin\u00a0Kyu Kim, Phillip\u00a0B Gibbons, Garth\u00a0A Gibson, Greg Ganger, and Eric\u00a0P Xing. 2013. More effective distributed ml via a stale synchronous parallel parameter server. Advances in neural information processing systems 26 (2013), 1223\u20131231."},{"key":"e_1_3_2_1_10_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V Le, Yonghui Wu, 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019), 103\u2013112."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2236584.2236592"},{"key":"e_1_3_2_1_12_1","volume-title":"Design Guidelines for High Performance RDMA Systems. In 2016 USENIX Annual Technical Conference (USENIX ATC 16)","author":"Kalia Anuj","year":"2016","unstructured":"Anuj Kalia, Michael Kaminsky, and David\u00a0G Andersen. 2016. Design Guidelines for High Performance RDMA Systems. In 2016 USENIX Annual Technical Conference (USENIX ATC 16). 437\u2013450."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_1_15_1","volume-title":"Flor: An Open High Performance RDMA Framework Over Heterogeneous RNICs. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Qiang","year":"2023","unstructured":"Qiang Li, Yixiao Gao, Xiaoliang Wang, Haonan Qiu, Yanfang Le, Derui Liu, Qiao Xiang, Fei Feng, Peng Zhang, Bo Li, Jianbo Dong, Lingbo Tang, Hongqiang\u00a0Harry Liu, Shaozong Liu, Weijie Li, Rui Miao, Yaohui Wu, Zhiwu Wu, Chao Han, Lei Yan, Zheng Cao, Zhongjie Wu, Chen Tian, Guihai Chen, Dennis Cai, Jinbo Wu, Jiaji Zhu, Jiesheng Wu, and Jiwu Shu. 2023. Flor: An Open High Performance RDMA Framework Over Heterogeneous RNICs. 
In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 931\u2013948."},{"key":"e_1_3_2_1_16_1","volume-title":"Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, 2020. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_1_17_1","unstructured":"Microsoft. 2023. Megatron-DeepSpeed. https:\/\/github.com\/microsoft\/Megatron-DeepSpeed."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_19_1","volume-title":"International Conference on Machine Learning. PMLR, 7937\u20137947","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. 2021. Memory-efficient pipeline-parallel dnn training. In International Conference on Machine Learning. PMLR, 7937\u20137947."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_21_1","unstructured":"NVIDIA. 2021. NCCL. https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_1_22_1","unstructured":"NVIDIA. 2022. Megatron-LM. https:\/\/github.com\/NVIDIA\/Megatron-LM."},{"key":"e_1_3_2_1_23_1","unstructured":"NVIDIA. 2022. NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_27_1","volume-title":"Hogwild: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent. Advances in neural information processing systems 24","author":"Recht Benjamin","year":"2011","unstructured":"Benjamin Recht, Christopher Re, Stephen Wright, and Feng Niu. 2011. Hogwild: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent. Advances in neural information processing systems 24 (2011), 693\u2013701."},{"key":"e_1_3_2_1_28_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza\u00a0Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale model training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 551\u2013564."},{"key":"e_1_3_2_1_29_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_30_1","volume-title":"Using deepspeed and megatron to train megatron-turing nlg 530b, a large-scale generative language model. 
arXiv preprint arXiv:2201.11990","author":"Smith Shaden","year":"2022","unstructured":"Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, 2022. Using deepspeed and megatron to train megatron-turing nlg 530b, a large-scale generative language model. arXiv preprint arXiv:2201.11990 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/79173.79181"},{"key":"e_1_3_2_1_33_1","volume-title":"TopoOpt: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Weiyang","year":"2023","unstructured":"Weiyang Wang, Moein Khazraee, Zhizhen Zhong, Manya Ghobadi, Zhihao Jia, Dheevatsa Mudigere, Ying Zhang, and Anthony Kewitsch. 2023. TopoOpt: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 739\u2013767."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2783323"},{"key":"e_1_3_2_1_35_1","first-page":"25464","article-title":"Decentralized training of foundation models in heterogeneous environments","volume":"35","author":"Yuan Binhang","year":"2022","unstructured":"Binhang Yuan, Yongjun He, Jared Davis, Tianyi Zhang, Tri Dao, Beidi Chen, Percy\u00a0S Liang, Christopher Re, and Ce Zhang. 2022. Decentralized training of foundation models in heterogeneous environments. Advances in Neural Information Processing Systems 35 (2022), 25464\u201325477.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi\u00a0Victoria Lin, 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"A survey of large language models. arXiv preprint arXiv:2303.18223","author":"Zhao Wayne\u00a0Xin","year":"2023","unstructured":"Wayne\u00a0Xin Zhao, Kun Zhou, Junyi Li, Tianyi Tang, Xiaolei Wang, Yupeng Hou, Yingqian Min, Beichen Zhang, Junjie Zhang, Zican Dong, 2023. A survey of large language models. arXiv preprint arXiv:2303.18223 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Zhuang Yonghao","year":"2023","unstructured":"Yonghao Zhuang, Hexu Zhao, Lianmin Zheng, Zhuohan Li, Eric Xing, Qirong Ho, Joseph Gonzalez, Ion Stoica, and Hao Zhang. 2023. On optimizing the communication of model parallelism. 
Proceedings of Machine Learning and Systems 5 (2023)."}],"event":{"name":"ICPP '24: the 53rd International Conference on Parallel Processing","location":"Gotland Sweden","acronym":"ICPP '24"},"container-title":["Proceedings of the 53rd International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3673038.3673095","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3673038.3673095","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T17:30:38Z","timestamp":1758648638000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3673038.3673095"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,12]]},"references-count":38,"alternative-id":["10.1145\/3673038.3673095","10.1145\/3673038"],"URL":"https:\/\/doi.org\/10.1145\/3673038.3673095","relation":{},"subject":[],"published":{"date-parts":[[2024,8,12]]},"assertion":[{"value":"2024-08-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}