{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:51:22Z","timestamp":1777063882064,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769322","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"263-278","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Handling Network Faults in Distributed AI Training: Failover is Now an Option"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-8048-8556","authenticated-orcid":false,"given":"Xin Zhe","family":"Khooi","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6144-7899","authenticated-orcid":false,"given":"Zhuo","family":"Jiang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7593-8956","authenticated-orcid":false,"given":"Pan","family":"Xie","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7726-188X","authenticated-orcid":false,"given":"Zhigang","family":"Cui","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2829-1976","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2077-3786","authenticated-orcid":false,"given":"Yuze","family":"Jin","sequence":"additional","affiliation":[{"name":"National University of Singapore, 
Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9713-8134","authenticated-orcid":false,"given":"Pengfei","family":"Huo","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6446-700X","authenticated-orcid":false,"given":"Dongyang","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5108-8116","authenticated-orcid":false,"given":"Lulu","family":"Chen","sequence":"additional","affiliation":[{"name":"ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3809-1879","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4599-0532","authenticated-orcid":false,"given":"Liaoyuan","family":"Feng","sequence":"additional","affiliation":[{"name":"ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0296-5664","authenticated-orcid":false,"given":"Xiaodong","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3355-2269","authenticated-orcid":false,"given":"Peng","family":"Li","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5406-6354","authenticated-orcid":false,"given":"Qinlong","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6416-0074","authenticated-orcid":false,"given":"Yang","family":"Bai","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7215-9753","authenticated-orcid":false,"given":"Yongcan","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Hangzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0540-1089","authenticated-orcid":false,"given":"Hao","family":"Jin","sequence":"additional","affiliation":[{"name":"ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0600-7009","authenticated-orcid":false,"given":"Jinshuai","family":"Sun","sequence":"additional","affiliation":[{"name":"ByteDance, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8200-3717","authenticated-orcid":false,"given":"Shan","family":"Lu","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6179-4332","authenticated-orcid":false,"given":"Xiang","family":"Shi","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4775-0438","authenticated-orcid":false,"given":"Yingkai","family":"Zhao","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8336-8065","authenticated-orcid":false,"given":"Haiquan","family":"Chen","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3069-9948","authenticated-orcid":false,"given":"Yi","family":"Li","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3395-3624","authenticated-orcid":false,"given":"Jianxi","family":"Ye","sequence":"additional","affiliation":[{"name":"ByteDance, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6563-275X","authenticated-orcid":false,"given":"Mun Choon","family":"Chan","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"d.]. https:\/\/github.com\/NVIDIA\/nccl\/issues\/998 [Accessed","year":"2024","unstructured":"[n. d.]. 
https:\/\/github.com\/NVIDIA\/nccl\/issues\/998 [Accessed: Oct 2024]."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. ACCL: Alibaba Cloud's self-developed high-performance collective communication library. https:\/\/help.aliyun.com\/zh\/pai\/user-guide\/accl-alibaba-high-performance-collective-communication-library [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_3_1","volume-title":"Infiniband Verbs Performance Tests. https:\/\/github.com\/linux-rdma\/perftest [Accessed","year":"2025","unstructured":"n.d.. Infiniband Verbs Performance Tests. https:\/\/github.com\/linux-rdma\/perftest [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_4_1","volume-title":"n.d. ROCm Communication Collectives Library (RCCL). https:\/\/github.com\/ROCm\/rccl [Accessed","author":"AMD.","year":"2025","unstructured":"AMD. n.d. ROCm Communication Collectives Library (RCCL). https:\/\/github.com\/ROCm\/rccl [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_5_1","volume-title":"n.d. What is RCCL? https:\/\/rocmdocs.amd.com\/projects\/rccl\/en\/latest\/what-is-rccl.html [Accessed","author":"AMD.","year":"2025","unstructured":"AMD. n.d. What is RCCL? https:\/\/rocmdocs.amd.com\/projects\/rccl\/en\/latest\/what-is-rccl.html [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_6_1","unstructured":"InfiniBand Trade Association. 2020. InfiniBand Architecture Specification Release."},{"key":"e_1_3_2_1_7_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen Technical Report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"d.]. Tomahawk 5 \/ BCM78900 Series - 51.2 Tb\/s StrataXGS\u00ae Tomahawk\u00ae 5 Ethernet Switch Series. https:\/\/www.broadcom.com\/products\/ethernet-connectivity\/switching\/strataxgs\/bcm78900-series [Accessed","author":"Broadcom Inc. [n.","year":"2025","unstructured":"Broadcom Inc. [n. d.]. Tomahawk 5 \/ BCM78900 Series - 51.2 Tb\/s StrataXGS\u00ae Tomahawk\u00ae 5 Ethernet Switch Series. 
https:\/\/www.broadcom.com\/products\/ethernet-connectivity\/switching\/strataxgs\/bcm78900-series [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_9_1","volume-title":"Minder: Faulty Machine Detection for Large-scale Distributed Model Training.","author":"Deng Yangtao","year":"2025","unstructured":"Yangtao Deng, Xiang Shi, Zhuo Jiang, Xingjian Zhang, Lei Zhang, Zhang Zhang, Bo Li, Zuquan Song, Hang Zhu, Gaohong Liu, Fuliang Li, Shuguang Wang, Haibin Lin, Jianxi Ye, and Minlan Yu. 2025. Minder: Faulty Machine Detection for Large-scale Distributed Model Training."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3091475"},{"key":"e_1_3_2_1_11_1","volume-title":"Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram.","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: a Checkpointing System for Training Deep Learning Recommendation Models. In USENIX NSDI."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Swapnil Gandhi Mark Zhao Athinagoras Skiadopoulos and Christos Kozyrakis. 2024. ReCycle: Resilient Training of Large DNNs using Pipeline Adaptation. In ACM SOSP.","DOI":"10.1145\/3694715.3695960"},{"key":"e_1_3_2_1_13_1","volume-title":"Guilherme Goes, Hany Morsy, Rohit Puri, Mohammad Riftadi, Ashmitha Jeevaraj Shetty, Jingyi Yang, Shuqiang Zhang, Mikel Jimenez Fernandez, Shashidhar Gandham, and Hongyi Zeng.","author":"Gangidi Adithya","year":"2024","unstructured":"Adithya Gangidi, Rui Miao, Shengbao Zheng, Sai Jayesh Bondu, Guilherme Goes, Hany Morsy, Rohit Puri, Mohammad Riftadi, Ashmitha Jeevaraj Shetty, Jingyi Yang, Shuqiang Zhang, Mikel Jimenez Fernandez, Shashidhar Gandham, and Hongyi Zeng. 2024. RDMA over Ethernet for Distributed Training at Meta Scale. 
In ACM SIGCOMM."},{"key":"e_1_3_2_1_14_1","unstructured":"Yanjie Gao Yu Liu Hongyu Zhang Zhengxian Li Yonghao Zhu Haoxiang Lin and Mao Yang. 2020. Estimating GPU Memory Consumption of Deep Learning Models. In ESEC\/FSE."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Phillipa Gill Navendu Jain and Nachiappan Nagappan. 2011. Understanding Network Failures in Data Centers: Measurement Analysis and Implications. In ACM SIGCOMM.","DOI":"10.1145\/2018436.2018477"},{"key":"e_1_3_2_1_16_1","volume-title":"Unicron: Economizing Self-Healing LLM Training at Scale. arXiv:2401.00134 [cs.DC] https:\/\/arxiv.org\/abs\/2401.00134","author":"He Tao","year":"2023","unstructured":"Tao He, Xue Li, Zhibin Wang, Kun Qian, Jingbo Xu, Wenyuan Yu, and Jingren Zhou. 2023. Unicron: Economizing Self-Healing LLM Training at Scale. arXiv:2401.00134 [cs.DC] https:\/\/arxiv.org\/abs\/2401.00134"},{"key":"e_1_3_2_1_17_1","volume-title":"Maria Apostolaki, Alberto Dainotti, Stefano Vissicchio, and Laurent Vanbever.","author":"Holterbach Thomas","year":"2019","unstructured":"Thomas Holterbach, Edgar Costa Molero, Maria Apostolaki, Alberto Dainotti, Stefano Vissicchio, and Laurent Vanbever. 2019. Blink: Fast Connectivity Recovery Entirely in the Data Plane. In USENIX NSDI."},{"key":"e_1_3_2_1_18_1","volume-title":"Oobleck: Resilient Distributed Training of Large Models Using Pipeline Templates. In ACM SOSP.","author":"Jang Insu","year":"2023","unstructured":"Insu Jang, Zhenning Yang, Zhen Zhang, Xin Jin, and Mosharaf Chowdhury. 2023. Oobleck: Resilient Distributed Training of Large Models Using Pipeline Templates. In ACM SOSP."},{"key":"e_1_3_2_1_19_1","unstructured":"Myeongjae Jeon Shivaram Venkataraman Amar Phanishayee Junjie Qian Wencong Xiao and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. 
In USENIX ATC."},{"key":"e_1_3_2_1_20_1","unstructured":"Ziheng Jiang Haibin Lin Yinmin Zhong Qi Huang Yangrui Chen Zhi Zhang Yanghua Peng Xiang Li Cong Xie Shibiao Nong Yulu Jia Sun He Hongmin Chen et al. 2024. MegaScale: Scaling Large Language Model Training to More Than 10 000 GPUs. In USENIX NSDI."},{"key":"e_1_3_2_1_21_1","volume-title":"Nishant Budhdev, Ayush Mishra, Mun Choon Chan, and Ben Leong.","author":"Joshi Raj","year":"2023","unstructured":"Raj Joshi, Chahwan Song, Xin Zhe Khooi, Nishant Budhdev, Ayush Mishra, Mun Choon Chan, and Ben Leong. 2023. Masking Corruption Packet Losses in Datacenter Networks with Link-local Retransmission. In ACM SIGCOMM."},{"key":"e_1_3_2_1_22_1","volume-title":"Andersen","author":"Kalia Anuj","year":"2016","unstructured":"Anuj Kalia, Michael Kaminsky, and David G. Andersen. 2016. Design Guidelines for High Performance RDMA Systems. In USENIX ATC."},{"key":"e_1_3_2_1_23_1","volume-title":"TCCL: Co-optimizing Collective Communication and Traffic Routing for GPU-centric Clusters. In ACM SIGCOMM NAIC Workshop.","author":"Li Baojia","year":"2024","unstructured":"Baojia Li, Xiaoliang Wang, Jingzhu Wang, Yifan Liu, Yuanyuan Gong, Hao Lu, Weizhen Dang, Weifeng Zhang, Xiaojie Huang, Mingzhuo Chen, Jie Chen, Chunzhi He, Yadong Liu, Xiaoyuan Hu, Chen Liu, Xuefeng Ji, Yinben Xia, Xiang Li, Zekun He, Yachen Wang, and Xianneng Zou. 2024. TCCL: Co-optimizing Collective Communication and Traffic Routing for GPU-centric Clusters. In ACM SIGCOMM NAIC Workshop."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_25_1","unstructured":"Shengkai Lin Qinwei Yang Zengyin Yang Yuchuan Wang and Shizhen Zhao. 2024. LubeRDMA: A Fail-safe Mechanism of RDMA. In ACM APNet."},{"key":"e_1_3_2_1_26_1","volume-title":"bonding. https:\/\/wiki.linuxfoundation.org\/networking\/bonding [Accessed","author":"Maintainers Linux Kernel","year":"2024","unstructured":"Linux Kernel Maintainers. 2018. bonding. 
https:\/\/wiki.linuxfoundation.org\/networking\/bonding [Accessed: Dec 2024]."},{"key":"e_1_3_2_1_27_1","volume-title":"Hostmesh: Monitor and Diagnose Networks in Rail-optimized RoCE Clusters. In ACM APNet.","author":"Liu Kefei","year":"2024","unstructured":"Kefei Liu, Jiao Zhang, Zhuo Jiang, Xuan Zhang, Shixian Guo, Yangyang Bai, Yongbin Dong, Zhang Zhang, Xiang Shi, Lei Wang, Haoran Wei, Zicheng Wang, Yongchen Pan, Tian Pan, and Tao Huang. 2024. Hostmesh: Monitor and Diagnose Networks in Rail-optimized RoCE Clusters. In ACM APNet."},{"key":"e_1_3_2_1_28_1","unstructured":"AI @ Meta LLAMA Team. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_29_1","unstructured":"Yuanwei Lu Guo Chen Bojie Li Kun Tan Yongqiang Xiong Peng Cheng Jiansong Zhang Enhong Chen and Thomas Moscibroda. 2018. Multi-Path Transport for RDMA in Datacenters. In USENIX NSDI."},{"key":"e_1_3_2_1_30_1","unstructured":"Karthik Mandakolathur and Sylvain Jeaugey. 2022. Doubling all2all Performance with NVIDIA Collective Communication Library 2.12. https:\/\/developer.nvidia.com\/blog\/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12\/ [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_31_1","volume-title":"n.d. Microsoft Collective Communication Library. https:\/\/github.com\/Azure\/msccl [Accessed","year":"2025","unstructured":"Microsoft. n.d. Microsoft Collective Communication Library. https:\/\/github.com\/Azure\/msccl [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_32_1","unstructured":"Jayashree Mohan Amar Phanishayee and Vijay Chidambaram. 2021. CheckFreq: Frequent Fine-Grained DNN Checkpointing. 
In USENIX FAST."},{"key":"e_1_3_2_1_33_1","volume-title":"Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, Amar Phanishayee, and Matei Zaharia.","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Mohammad Shoeybi, Jared Casper, Patrick LeGresley, Mostofa Patwary, Vijay Anand Korthikanti, Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, Amar Phanishayee, and Matei Zaharia. 2021. Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM. arXiv:2104.04473 [cs.CL] https:\/\/arxiv.org\/abs\/2104.04473"},{"key":"e_1_3_2_1_34_1","unstructured":"NVIDIA. 2020. Unbreakable Links - MLNX-OS v3.9.0300 - NVIDIA Networking Docs. https:\/\/docs.nvidia.com\/networking\/display\/MLNXOSv390300\/Unbreakable+Links."},{"key":"e_1_3_2_1_35_1","volume-title":"https:\/\/docs.nvidia.com\/doca\/sdk\/link+aggregation\/index.html [Accessed","author":"Link Aggregation NVIDIA.","year":"2024","unstructured":"NVIDIA. 2024. Link Aggregation. https:\/\/docs.nvidia.com\/doca\/sdk\/link+aggregation\/index.html [Accessed: Dec 2024]."},{"key":"e_1_3_2_1_36_1","volume-title":"Network Fabrics \u2014 NVIDIA DGX Super-POD. https:\/\/docs.nvidia.com\/dgx-superpod\/reference-architecture-scalable-infrastructure-h100\/latest\/network-fabrics.html [Accessed","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2024. Network Fabrics \u2014 NVIDIA DGX Super-POD. https:\/\/docs.nvidia.com\/dgx-superpod\/reference-architecture-scalable-infrastructure-h100\/latest\/network-fabrics.html [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_37_1","volume-title":"NVIDIA Resiliency Extension. https:\/\/github.com\/NVIDIA\/nvidia-resiliency-ext [Accessed","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. 2024. NVIDIA Resiliency Extension. https:\/\/github.com\/NVIDIA\/nvidia-resiliency-ext [Accessed: Dec 2024]."},{"key":"e_1_3_2_1_38_1","volume-title":"n.d. NCCL Tests. 
https:\/\/github.com\/NVIDIA\/nccl-tests [Accessed","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. n.d. NCCL Tests. https:\/\/github.com\/NVIDIA\/nccl-tests [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_39_1","volume-title":"n.d. NVIDIA Collective Communication Library (NCCL) Documentation. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/index.html [Accessed","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. n.d. NVIDIA Collective Communication Library (NCCL) Documentation. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/index.html [Accessed: August 2024]."},{"key":"e_1_3_2_1_40_1","volume-title":"n.d. Optimized primitives for inter-GPU communication. https:\/\/github.com\/NVIDIA\/nccl [Accessed","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. n.d. Optimized primitives for inter-GPU communication. https:\/\/github.com\/NVIDIA\/nccl [Accessed: Jan 2025]."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Kun Qian Yongqing Xi Jiamin Cao Jiaqi Gao Yichi Xu Yu Guan Binzhang Fu Xuemei Shi Fangbo Zhu Rui Miao Chao Wang Peng Wang Pengcheng Zhang Xianlong Zeng Eddie Ruan Zhiping Yao Ennan Zhai and Dennis Cai. 2024. Alibaba HPN: A Data Center Network for Large Language Model Training. In ACM SIGCOMM.","DOI":"10.1145\/3651890.3672265"},{"key":"e_1_3_2_1_42_1","volume-title":"Ben Leong, Deke Guo, and Zhong Liu.","author":"Qu Ting","year":"2019","unstructured":"Ting Qu, Raj Joshi, Mun Choon Chan, Ben Leong, Deke Guo, and Zhong Liu. 2019. SQR: In-network Packet Loss Recovery from Link Failures for Highly Reliable Datacenter Networks. In IEEE ICNP."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Jeff Rasley Samyam Rajbhandari Olatunji Ruwase and Yuxiong He. 2020. DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. 
In ACM KDD.","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_44_1","unstructured":"Zhenghang Ren Yuxuan Li Zilong Wang Xinyang Huang Wenxue Li Kaiqiang Xu Xudong Liao Yijun Sun Bowen Liu Han Tian et al. 2025. Enabling Efficient GPU Communication over Multiple NICs with FuseLink. In USENIX OSDI."},{"key":"e_1_3_2_1_45_1","volume-title":"BLOOM: A 176B-Parameter Open-Access Multilingual Language Model. arXiv:2211.05100 [cs.CL] https:\/\/arxiv.org\/abs\/2211.05100","author":"Scao Teven Le","year":"2023","unstructured":"Teven Le Scao et al. 2023. BLOOM: A 176B-Parameter Open-Access Multilingual Language Model. arXiv:2211.05100 [cs.CL] https:\/\/arxiv.org\/abs\/2211.05100"},{"key":"e_1_3_2_1_46_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799 (2018). arXiv:1802.05799 http:\/\/arxiv.org\/abs\/1802.05799"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3464994.3464996"},{"key":"e_1_3_2_1_48_1","volume-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In USENIX NSDI.","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. 2023. Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In USENIX NSDI."},{"key":"e_1_3_2_1_49_1","volume-title":"A Novel Software-based Multi-path RDMA Solution for Data Center Networks. CoRR abs\/2009.00243","author":"Tian Feng","year":"2020","unstructured":"Feng Tian, Wendi Feng, Yang Zhang, and Zhi-Li Zhang. 2020. A Novel Software-based Multi-path RDMA Solution for Data Center Networks. 
CoRR abs\/2009.00243 (2020)."},{"key":"e_1_3_2_1_50_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 [cs.CL] https:\/\/arxiv.org\/abs\/2307.09288"},{"key":"e_1_3_2_1_51_1","volume-title":"Rail-only: A Low-Cost High-Performance Network for Training LLMs with Trillion Parameters","author":"Wang Weiyang","year":"2024","unstructured":"Weiyang Wang, Manya Ghobadi, Kayvon Shakeri, Ying Zhang, and Naader Hasani. 2024. Rail-only: A Low-Cost High-Performance Network for Training LLMs with Trillion Parameters. In IEEE HOTI."},{"key":"e_1_3_2_1_52_1","volume-title":"GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints. In ACM SOSP.","author":"Wang Zhuang","year":"2023","unstructured":"Zhuang Wang, Zhen Jia, Shuai Zheng, Zhen Zhang, Xinwei Fu, T. S. Eugene Ng, and Yida Wang. 2023. GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints. In ACM SOSP."},{"key":"e_1_3_2_1_53_1","unstructured":"Tong Xiao and Jingbo Zhu. 2025. 
Foundations of Large Language Models. arXiv:2501.09223 [cs.CL] https:\/\/arxiv.org\/abs\/2501.09223"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Chenyu Yang Yuntao Chen Hao Tian Chenxin Tao Xizhou Zhu Zhaoxiang Zhang Gao Huang Hongyang Li Yu Qiao Lewei Lu Jie Zhou and Jifeng Dai. 2022. BEVFormer v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition via Perspective Supervision. arXiv:2211.10439 [cs.CV] https:\/\/arxiv.org\/abs\/2211.10439","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"e_1_3_2_1_55_1","unstructured":"Wayne Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong Yifan Du Chen Yang Yushuo Chen Zhipeng Chen Jinhao Jiang Ruiyang Ren Yifan Li Xinyu Tang Zikang Liu Peiyu Liu Jian-Yun Nie and Ji-Rong Wen. 2025. A Survey of Large Language Models. arXiv:2303.18223 [cs.CL] https:\/\/arxiv.org\/abs\/2303.18223"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Danyang Zhuo Manya Ghobadi Ratul Mahajan Klaus-Tycho F\u00f6rster Arvind Krishnamurthy and Thomas Anderson. 2017. Understanding and Mitigating Packet Corruption in Data Center Networks. In ACM SIGCOMM.","DOI":"10.1145\/3098822.3098849"},{"key":"e_1_3_2_1_57_1","unstructured":"Yazhou Zu Alireza Ghaffarkhah Hoang-Vu Dang Brian Towles Steven Hand Safeen Huda Adekunle Bello Alexander Kolbasov Arash Rezaei Dayou Du Steve Lacy Hang Wang Aaron Wisner Chris Lewis and Henri Bahini. 2024. Resiliency at Scale: Managing Google's TPUv4 Machine Learning Supercomputer. 
In USENIX NSDI."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:21:56Z","timestamp":1777062116000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769322"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":57,"alternative-id":["10.1145\/3767295.3769322","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769322","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}