{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,2]],"date-time":"2025-10-02T00:47:25Z","timestamp":1759366045116,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764848","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"254-269","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Mycroft: Tracing Dependencies in Collective Communication Towards Reliable LLM Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4072-7696","authenticated-orcid":false,"given":"Yangtao","family":"Deng","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5132-4251","authenticated-orcid":false,"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance, Seattle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5406-6354","authenticated-orcid":false,"given":"Qinlong","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5403-9720","authenticated-orcid":false,"given":"Xiaoyun","family":"Zhi","sequence":"additional","affiliation":[{"name":"ByteDance Seed, San Jose, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2279-4258","authenticated-orcid":false,"given":"Xinlei","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6144-7899","authenticated-orcid":false,"given":"Zhuo","family":"Jiang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9813-4570","authenticated-orcid":false,"given":"Haohan","family":"Xu","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3809-1879","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7576-6162","authenticated-orcid":false,"given":"Zuquan","family":"Song","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2551-7879","authenticated-orcid":false,"given":"Gaohong","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6416-0074","authenticated-orcid":false,"given":"Yang","family":"Bai","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4249-4092","authenticated-orcid":false,"given":"Shuguang","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3043-522X","authenticated-orcid":false,"given":"Wencong","family":"Xiao","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3395-3624","authenticated-orcid":false,"given":"Jianxi","family":"Ye","sequence":"additional","affiliation":[{"name":"ByteDance, Seattle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2381-0212","authenticated-orcid":false,"given":"Minlan","family":"Yu","sequence":"additional","affiliation":[{"name":"Harvard University, Boston, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9359-9571","authenticated-orcid":false,"given":"Hong","family":"Xu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2025. py-spy. https:\/\/pypi.org\/project\/py-spy\/. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_2_1","unstructured":"2025. RCCL. https:\/\/github.com\/ROCm\/rccl. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_3_1","volume-title":"XPUTimer: Anomaly Diagnostics for Divergent LLM Training in GPU Clusters of Thousand-Plus Scale. arXiv preprint arXiv:2502.05413","author":"Cui Weihao","year":"2025","unstructured":"Weihao Cui, Ji Zhang, Han Zhao, Chao Liu, Wenhao Zhang, Jian Sha, Quan Chen, Bingsheng He, and Minyi Guo. 2025. XPUTimer: Anomaly Diagnostics for Divergent LLM Training in GPU Clusters of Thousand-Plus Scale. arXiv preprint arXiv:2502.05413 (2025)."},{"key":"e_1_3_2_1_4_1","volume-title":"Minder: Faulty Machine Detection for Large-scale Distributed Model Training. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Deng Yangtao","year":"2025","unstructured":"Yangtao Deng, Xiang Shi, Zhuo Jiang, Xingjian Zhang, Lei Zhang, Zhang Zhang, Bo Li, Zuquan Song, Hang Zhu, Gaohong Liu, et al. 2025. Minder: Faulty Machine Detection for Large-scale Distributed Model Training. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). 505\u2013521."},{"key":"e_1_3_2_1_5_1","volume-title":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA). 1246\u20131258","author":"Dong Jianbo","year":"2025","unstructured":"Jianbo Dong, Bin Luo, Jun Zhang, Pengcheng Zhang, Fei Feng, Yikai Zhu, Ang Liu, Zian Chen, Yi Shi, Hairong Jiao, Gang Lu, Yu Guan, Ennan Zhai, Wencong Xiao, Hanyu Zhao, Man Yuan, Siran Yang, Xiang Li, Jiamang Wang, Rui Men, Jianwei Zhang, Chang Zhou, Dennis Cai, Yuan Xie, and Binzhang Fu. 2025. Enhancing Large-Scale AI Training Efficiency: The C4 Solution for Real-Time Anomaly Detection and Communication Optimization. In 2025 IEEE International Symposium on High Performance Computer Architecture (HPCA). 1246\u20131258. 10.1109\/HPCA61900.2025.00095"},{"key":"e_1_3_2_1_6_1","volume-title":"Evolution of Aegis: Fault Diagnosis for AI Model Training Service in Production. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Dong Jianbo","year":"2025","unstructured":"Jianbo Dong, Kun Qian, Pengcheng Zhang, Zhilong Zheng, Liang Chen, Fei Feng, Yichi Xu, Yikai Zhu, Gang Lu, Xue Li, et al. 2025. Evolution of Aegis: Fault Diagnosis for AI Model Training Service in Production. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). 865\u2013881."},{"key":"e_1_3_2_1_7_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The Llama 3 Herd of Models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"X-Trace: A Pervasive Network Tracing Framework. In 4th USENIX Symposium on Networked Systems Design & Implementation (NSDI 07)","author":"Fonseca Rodrigo","year":"2007","unstructured":"Rodrigo Fonseca, George Porter, Randy H Katz, and Scott Shenker. 2007. X-Trace: A Pervasive Network Tracing Framework. In 4th USENIX Symposium on Networked Systems Design & Implementation (NSDI 07)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles. 211\u2013228","author":"Gandhi Swapnil","year":"2024","unstructured":"Swapnil Gandhi, Mark Zhao, Athinagoras Skiadopoulos, and Christos Kozyrakis. 2024. ReCycle: Resilient Training of Large DNNs using Pipeline Adaptation. In Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles. 211\u2013228."},{"key":"e_1_3_2_1_10_1","volume-title":"Open MPI: A Flexible High Performance MPI. In Parallel Processing and Applied Mathematics: 6th International Conference, PPAM 2005","author":"Graham Richard L","year":"2006","unstructured":"Richard L Graham, Timothy S Woodall, and Jeffrey M Squyres. 2006. Open MPI: A Flexible High Performance MPI. In Parallel Processing and Applied Mathematics: 6th International Conference, PPAM 2005, Pozna\u0144, Poland, September 11-14, 2005, Revised Selected Papers 6. Springer, 228\u2013239."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3242086","article-title":"Fail-Slow at Scale: Evidence of Hardware Performance Faults in Large Production Systems","volume":"14","author":"Gunawi Haryadi S","year":"2018","unstructured":"Haryadi S Gunawi, Riza O Suminto, Russell Sears, Casey Golliher, Swaminathan Sundararaman, Xing Lin, Tim Emami, Weiguang Sheng, Nematollah Bidokhti, Caitie McCaffrey, et al. 2018. Fail-Slow at Scale: Evidence of Hardware Performance Faults in Large Production Systems. ACM Transactions on Storage (TOS) 14, 3 (2018), 1\u201326.","journal-title":"ACM Transactions on Storage (TOS)"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 2015 ACM Conference on Special Interest Group on Data Communication. 139\u2013152","author":"Guo Chuanxiong","year":"2015","unstructured":"Chuanxiong Guo, Lihua Yuan, Dong Xiang, Yingnong Dang, Ray Huang, Dave Maltz, Zhaoyi Liu, Vin Wang, Bin Pang, Hua Chen, et al. 2015. Pingmesh: A Large-scale System for Data Center Network Latency Measurement and Analysis. In Proceedings of the 2015 ACM Conference on Special Interest Group on Data Communication. 139\u2013152."},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the seventh ACM symposium on cloud computing. 98\u2013111","author":"Harlap Aaron","year":"2016","unstructured":"Aaron Harlap, Henggang Cui, Wei Dai, Jinliang Wei, Gregory R Ganger, Phillip B Gibbons, Garth A Gibson, and Eric P Xing. 2016. Addressing the Straggler Problem for Iterative Convergent Parallel ML. In Proceedings of the seventh ACM symposium on cloud computing. 98\u2013111."},{"key":"e_1_3_2_1_14_1","volume-title":"Characterization of Large Language Model Development in the Datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hu Qinghao","year":"2024","unstructured":"Qinghao Hu, Zhisheng Ye, Zerui Wang, Guoteng Wang, Meng Zhang, Qiaoling Chen, Peng Sun, Dahua Lin, Xiaolin Wang, Yingwei Luo, et al. 2024. Characterization of Large Language Model Development in the Datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 709\u2013729."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 16th Workshop on Hot Topics in Operating Systems. 150\u2013155","author":"Huang Peng","year":"2017","unstructured":"Peng Huang, Chuanxiong Guo, Lidong Zhou, Jacob R Lorch, Yingnong Dang, Murali Chintalapati, and Randolph Yao. 2017. Gray Failure: The Achilles' Heel of Cloud-Scale Systems. In Proceedings of the 16th Workshop on Hot Topics in Operating Systems. 150\u2013155."},{"key":"e_1_3_2_1_16_1","unstructured":"Yanping Huang Youlong Cheng Ankur Bapna Orhan Firat Dehao Chen Mia Chen HyoukJoong Lee Jiquan Ngiam Quoc V Le Yonghui Wu et al. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_17_1","unstructured":"IBM. 2025. Autopilot. https:\/\/github.com\/IBM\/autopilot. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles. 382\u2013395","author":"Jang Insu","year":"2023","unstructured":"Insu Jang, Zhenning Yang, Zhen Zhang, Xin Jin, and Mosharaf Chowdhury. 2023. Oobleck: Resilient Distributed Training of Large Models Using Pipeline Templates. In Proceedings of the 29th Symposium on Operating Systems Principles. 382\u2013395."},{"key":"e_1_3_2_1_19_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 947\u2013960."},{"key":"e_1_3_2_1_20_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, et al. 2024. MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 745\u2013760."},{"key":"e_1_3_2_1_21_1","volume-title":"Revisiting Reliability in Large-Scale Machine Learning Research Clusters. arXiv preprint arXiv:2410.21680","author":"Kokolis Apostolos","year":"2024","unstructured":"Apostolos Kokolis, Michael Kuchnik, John Hoffman, Adithya Kumar, Parth Malani, Faye Ma, Zachary DeVito, Shubho Sengupta, Kalyan Saladi, and Carole-Jean Wu. 2024. Revisiting Reliability in Large-Scale Machine Learning Research Clusters. arXiv preprint arXiv:2410.21680 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"ExChain: Exception Dependency Analysis for Root Cause Diagnosis. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Li Ao","year":"2024","unstructured":"Ao Li, Shan Lu, Zhuotao Liu, Suman Nath, Michael Leighton, Rohan Padhye, Diedi Hu, Bingchuan Tian, Vyas Sekar, Maomao Ding, et al. 2024. ExChain: Exception Dependency Analysis for Root Cause Diagnosis. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 2047\u20132062."},{"key":"e_1_3_2_1_23_1","volume-title":"Malleus: Straggler-Resilient Hybrid Parallel Training of Large-scale Models via Malleable Data and Model Parallelization. arXiv preprint arXiv:2410.13333","author":"Li Haoyang","year":"2024","unstructured":"Haoyang Li, Fangcheng Fu, Hao Ge, Sheng Lin, Xuanyu Wang, Jiawen Niu, Yujie Wang, Hailin Zhang, Xiaonan Nie, and Bin Cui. 2024. Malleus: Straggler-Resilient Hybrid Parallel Training of Large-scale Models via Malleable Data and Model Parallelization. arXiv preprint arXiv:2410.13333 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 50th Annual International Symposium on Computer Architecture (ISCA '23)","author":"Liang Mingyu","year":"2023","unstructured":"Mingyu Liang, Wenyin Fu, Louis Feng, Zhongyi Lin, Pavani Panakanti, Shengbao Zheng, Srinivas Sridharan, and Christina Delimitrou. 2023. Mystique: Enabling Accurate and Scalable Generation of Production AI Benchmarks. In Proceedings of the 50th Annual International Symposium on Computer Architecture (ISCA '23). Association for Computing Machinery, Article 37, 13 pages."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the ACM SIGCOMM 2024 Conference. 554\u2013567","author":"Liu Kefei","year":"2024","unstructured":"Kefei Liu, Zhuo Jiang, Jiao Zhang, Shixian Guo, Xuan Zhang, Yangyang Bai, Yongbin Dong, Feng Luo, Zhang Zhang, Lei Wang, et al. 2024. R-pingmesh: A Service-aware RoCE Network Monitoring and Diagnostic System. In Proceedings of the ACM SIGCOMM 2024 Conference. 554\u2013567."},{"key":"e_1_3_2_1_26_1","volume-title":"Hostping: Diagnosing Intra-host Network Bottlenecks in RDMA Servers. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Liu Kefei","year":"2023","unstructured":"Kefei Liu, Zhuo Jiang, Jiao Zhang, Haoran Wei, Xiaolong Zhong, Lizhuang Tan, Tian Pan, and Tao Huang. 2023. Hostping: Diagnosing Intra-host Network Bottlenecks in RDMA Servers. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 15\u201329."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3208104","article-title":"Pivot Tracing: Dynamic Causal Monitoring for Distributed Systems","volume":"35","author":"Mace Jonathan","year":"2018","unstructured":"Jonathan Mace, Ryan Roelke, and Rodrigo Fonseca. 2018. Pivot Tracing: Dynamic Causal Monitoring for Distributed Systems. ACM Transactions on Computer Systems (TOCS) 35, 4 (2018), 1\u201328.","journal-title":"ACM Transactions on Computer Systems (TOCS)"},{"key":"e_1_3_2_1_28_1","volume-title":"KungFu: Making Training in Distributed Machine Learning Adaptive. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Mai Luo","year":"2020","unstructured":"Luo Mai, Guo Li, Marcel Wagenl\u00e4nder, Konstantinos Fertakis, Andrei-Octavian Brabete, and Peter Pietzuch. 2020. KungFu: Making Training in Distributed Machine Learning Adaptive. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 937\u2013954."},{"key":"e_1_3_2_1_29_1","unstructured":"Meta. 2025. Holistic Trace Analysis. https:\/\/github.com\/facebookresearch\/HolisticTraceAnalysis. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_30_1","unstructured":"Meta. 2025. Kineto. https:\/\/github.com\/pytorch\/kineto. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_31_1","unstructured":"Meta. 2025. OPT-175B logbook. https:\/\/github.com\/facebookresearch\/metaseq\/blob\/main\/projects\/OPT\/chronicles\/OPT175B_Logbook.pdf. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_32_1","unstructured":"Microsoft. 2025. MSCCL. https:\/\/github.com\/microsoft\/msccl. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_33_1","unstructured":"Microsoft. 2025. NCCL Profiling Kit (NPKit). https:\/\/github.com\/microsoft\/NPKit. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_34_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). 203\u2013216."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 27th ACM symposium on operating systems principles. 1\u201315","author":"Narayanan Deepak","year":"2019","unstructured":"Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, Nikhil R Devanur, Gregory R Ganger, Phillip B Gibbons, and Matei Zaharia. 2019. PipeDream: Generalized Pipeline Parallelism for DNN Training. In Proceedings of the 27th ACM symposium on operating systems principles. 1\u201315."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the international conference for high performance computing, networking, storage and analysis. 1\u201315","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Mohammad Shoeybi, Jared Casper, Patrick LeGresley, Mostofa Patwary, Vijay Korthikanti, Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, et al. 2021. Efficient Large-scale Language Model Training on GPU Clusters Using Megatron-LM. In Proceedings of the international conference for high performance computing, networking, storage and analysis. 1\u201315."},{"key":"e_1_3_2_1_37_1","unstructured":"Maxim Naumov John Kim Dheevatsa Mudigere Srinivas Sridharan Xiaodong Wang Whitney Zhao Serhat Yilmaz Changkyu Kim Hector Yuen Mustafa Ozdal et al. 2020. Deep Learning Training in Facebook Data Centers: Design of Scale-up and Scale-out Systems. arXiv preprint arXiv:2003.09518 (2020)."},{"key":"e_1_3_2_1_38_1","unstructured":"NVIDIA. 2025. ConnectX-6. https:\/\/www.nvidia.com\/en-sg\/networking\/ethernet\/connectx-6\/. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_39_1","unstructured":"NVIDIA. 2025. CUDA Profiling Tools Interface. https:\/\/developer.nvidia.com\/cupti. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA. 2025. Error Injection. https:\/\/docs.nvidia.com\/datacenter\/dcgm\/latest\/user-guide\/dcgm-error-injection.html. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_41_1","unstructured":"NVIDIA. 2025. NCCL Test. https:\/\/github.com\/NVIDIA\/nccl-tests. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_42_1","unstructured":"NVIDIA. 2025. Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_43_1","unstructured":"NVIDIA. 2025. NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_44_1","unstructured":"NVIDIA. 2025. NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_45_1","unstructured":"NVIDIA. 2025. nvidia-resiliency-ext. https:\/\/nvidia.github.io\/nvidia-resiliency-ext\/. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_46_1","unstructured":"Nvidia. 2025. NVLink and NVSwitch. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_47_1","unstructured":"NVIDIA. 2025. perftest. https:\/\/github.com\/linux-rdma\/perftest. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_48_1","unstructured":"PyTorch. 2025. Distributed communication package - torch.distributed. https:\/\/pytorch.org\/docs\/stable\/distributed.html. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_49_1","unstructured":"PyTorch. 2025. Flight Recorder. https:\/\/pytorch.org\/tutorials\/prototype\/flight_recorder_tutorial.html. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_50_1","unstructured":"PyTorch. 2025. PyTorch Profiler. https:\/\/pytorch.org\/docs\/stable\/profiler.html. (2025). Accessed: 2025-04-17."},{"key":"e_1_3_2_1_51_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Qi Penghui","year":"2024","unstructured":"Penghui Qi, Xinyi Wan, Guangxing Huang, and Min Lin. 2024. Zero Bubble (almost) Pipeline Parallelism. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_52_1","volume-title":"Zero: Memory Optimizations Toward Training Trillion Parameter Models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201316","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. Zero: Memory Optimizations Toward Training Trillion Parameter Models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201316."},{"key":"e_1_3_2_1_53_1","volume-title":"Megatron-LM: Training Multi-billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_54_1","volume-title":"Mike Burrows, Pat Stephenson, Manoj Plakal, Donald Beaver, Saul Jaspan, and Chandan Shanbhag.","author":"Sigelman Benjamin H","year":"2010","unstructured":"Benjamin H Sigelman, Luiz Andr\u00e9 Barroso, Mike Burrows, Pat Stephenson, Manoj Plakal, Donald Beaver, Saul Jaspan, and Chandan Shanbhag. 2010. Dapper, a Large-Scale Distributed Systems Tracing Infrastructure. Technical report, Google, Inc (2010)."},{"key":"e_1_3_2_1_55_1","unstructured":"Shaden Smith Mostofa Patwary Brandon Norick Patrick LeGresley Samyam Rajbhandari Jared Casper Zhun Liu Shrimai Prabhumoye George Zerveas Vijay Korthikanti et al. 2022. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530b a Large-Scale Generative Language Model. arXiv preprint arXiv:2201.11990 (2022)."},{"key":"e_1_3_2_1_56_1","volume-title":"Chakra: Advancing Performance Benchmarking and Co-design using Standardized Execution Traces. arXiv preprint arXiv:2305.14516","author":"Sridharan Srinivas","year":"2023","unstructured":"Srinivas Sridharan, Taekyung Heo, Louis Feng, Zhaodong Wang, Matt Bergeron, Wenyin Fu, Shengbao Zheng, Brian Coutinho, Saeed Rashidi, Changhai Man, et al. 2023. Chakra: Advancing Performance Benchmarking and Co-design using Standardized Execution Traces. arXiv preprint arXiv:2305.14516 (2023)."},{"key":"e_1_3_2_1_57_1","volume-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. 2023. Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 497\u2013513."},{"key":"e_1_3_2_1_58_1","unstructured":"Borui Wan Mingji Han Yiyao Sheng Yanghua Peng Haibin Lin Mofan Zhang Zhichao Lai Menghan Yu Junda Zhang Zuquan Song et al. 2024. ByteCheckpoint: A Unified Checkpointing System for Large Foundation Model Development. arXiv preprint arXiv:2407.20143 (2024)."},{"key":"e_1_3_2_1_59_1","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Wang Shibo","year":"2022","unstructured":"Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao Zhang, et al. 2022. Overlap Communication with Dependent Computation via Decomposition in Large Deep Learning Models. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1. 93\u2013106."},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles. 364\u2013381","author":"Wang Zhuang","year":"2023","unstructured":"Zhuang Wang, Zhen Jia, Shuai Zheng, Zhen Zhang, Xinwei Fu, TS Eugene Ng, and Yida Wang. 2023. GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints. In Proceedings of the 29th Symposium on Operating Systems Principles. 364\u2013381."},{"key":"e_1_3_2_1_61_1","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Wu Tianyuan","year":"2025","unstructured":"Tianyuan Wu, Wei Wang, Yinghao Yu, Siran Yang, Wenchao Wu, Qinkai Duan, Guodong Yang, Jiamang Wang, Lin Qu, and Liping Zhang. 2025. GREYHOUND: Hunting Fail-Slows in Hybrid-Parallel Training at Scale. In 2025 USENIX Annual Technical Conference (USENIX ATC 25). 731\u2013747."},{"key":"e_1_3_2_1_62_1","volume-title":"Cloud Atlas: Efficient Fault Localization for Cloud Systems using Language Models and Causal Insight. arXiv preprint arXiv:2407.08694","author":"Xie Zhiqiang","year":"2024","unstructured":"Zhiqiang Xie, Yujia Zheng, Lizi Ottens, Kun Zhang, Christos Kozyrakis, and Jonathan Mace. 2024. Cloud Atlas: Efficient Fault Localization for Cloud Systems using Language Models and Causal Insight. arXiv preprint arXiv:2407.08694 (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"SuperBench: Improving Cloud AI Infrastructure Reliability with Proactive Validation. In 2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Xiong Yifan","year":"2024","unstructured":"Yifan Xiong, Yuting Jiang, Ziyue Yang, Lei Qu, Guoshuai Zhao, Shuguang Liu, Dong Zhong, Boris Pinzur, Jie Zhang, Yang Wang, et al. 2024. SuperBench: Improving Cloud AI Infrastructure Reliability with Proactive Validation. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 835\u2013850."},{"key":"e_1_3_2_1_64_1","volume-title":"Companion Proceedings of the 32nd ACM International Conference on the Foundations of Software Engineering. 50\u201361","author":"Yao Zhenhe","year":"2024","unstructured":"Zhenhe Yao, Changhua Pei, Wenxiao Chen, Hanzhang Wang, Liangfei Su, Huai Jiang, Zhe Xie, Xiaohui Nie, and Dan Pei. 2024. Chain-of-Event: Interpretable Root Cause Analysis for Microservices through Automatically Learning Weighted Event Causal Graph. In Companion Proceedings of the 32nd ACM International Conference on the Foundations of Software Engineering. 50\u201361."},{"key":"e_1_3_2_1_65_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_2_1_66_1","volume-title":"The Benefit of Hindsight: Tracing Edge-Cases in Distributed Systems. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Lei","year":"2023","unstructured":"Lei Zhang, Zhiqiang Xie, Vaastav Anand, Ymir Vigfusson, and Jonathan Mace. 2023. The Benefit of Hindsight: Tracing Edge-Cases in Distributed Systems. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 321\u2013339."},{"key":"e_1_3_2_1_67_1","volume-title":"Proceedings of the Workshop on Network Meets AI & ML. 8\u201313","author":"Zhang Zhen","year":"2020","unstructured":"Zhen Zhang, Chaokun Chang, Haibin Lin, Yida Wang, Raman Arora, and Xin Jin. 2020. Is Network the Bottleneck of Distributed Training?. In Proceedings of the Workshop on Network Meets AI & ML. 8\u201313."},{"key":"e_1_3_2_1_68_1","volume-title":"AdapCC: Making Collective Communication in Distributed Machine Learning Adaptive. In 2024 IEEE 44th International Conference on Distributed Computing Systems (ICDCS). IEEE, 25\u201335","author":"Zhao Xiaoyang","year":"2024","unstructured":"Xiaoyang Zhao, Zhe Zhang, and Chuan Wu. 2024. AdapCC: Making Collective Communication in Distributed Machine Learning Adaptive. In 2024 IEEE 44th International Conference on Distributed Computing Systems (ICDCS). IEEE, 25\u201335."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer et al. 2023. Pytorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arXiv preprint arXiv:2304.11277 (2023).","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_70_1","volume-title":"Alpa: Automating Inter-and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. 2022. Alpa: Automating Inter-and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 559\u2013578."}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:46:31Z","timestamp":1759322791000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764848"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":70,"alternative-id":["10.1145\/3731569.3764848","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764848","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}