{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T08:35:22Z","timestamp":1777106122202,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":98,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764838","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"186-203","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Robust LLM Training Infrastructure at ByteDance"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5902-1611","authenticated-orcid":false,"given":"Borui","family":"Wan","sequence":"first","affiliation":[{"name":"School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2551-7879","authenticated-orcid":false,"given":"Gaohong","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7576-6162","authenticated-orcid":false,"given":"Zuquan","family":"Song","sequence":"additional","affiliation":[{"name":"ByteDance, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8493-0624","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9159-3107","authenticated-orcid":false,"given":"Yun","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3395-3994","authenticated-orcid":false,"given":"Guangming","family":"Sheng","sequence":"additional","affiliation":[{"name":"School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4249-4092","authenticated-orcid":false,"given":"Shuguang","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0066-8154","authenticated-orcid":false,"given":"Houmin","family":"Wei","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5553-2601","authenticated-orcid":false,"given":"Chenyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7390-0581","authenticated-orcid":false,"given":"Weiqiang","family":"Lou","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2195-8834","authenticated-orcid":false,"given":"Xi","family":"Yang","sequence":"additional","affiliation":[{"name":"ByteDance, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1177-9771","authenticated-orcid":false,"given":"Mofan","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4598-3807","authenticated-orcid":false,"given":"Kaihua","family":"Jiang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9946-8320","authenticated-orcid":false,"given":"Cheng","family":"Ren","sequence":"additional","affiliation":[{"name":"ByteDance, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5403-9720","authenticated-orcid":false,"given":"Xiaoyun","family":"Zhi","sequence":"additional","affiliation":[{"name":"ByteDance, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8240-8144","authenticated-orcid":false,"given":"Menghan","family":"Yu","sequence":"additional","affiliation":[{"name":"ByteDance, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6311-0018","authenticated-orcid":false,"given":"Zhe","family":"Nan","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2228-1527","authenticated-orcid":false,"given":"Zhuolin","family":"Zheng","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0833-5619","authenticated-orcid":false,"given":"Baoquan","family":"Zhong","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5406-6354","authenticated-orcid":false,"given":"Qinlong","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8102-5055","authenticated-orcid":false,"given":"Huan","family":"Yu","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3708-3065","authenticated-orcid":false,"given":"Jinxin","family":"Chi","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2436-8761","authenticated-orcid":false,"given":"Wang","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5310-9523","authenticated-orcid":false,"given":"Yuhan","family":"Li","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9750-7157","authenticated-orcid":false,"given":"Zixian","family":"Du","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3100-2772","authenticated-orcid":false,"given":"Sida","family":"Zhao","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0351-5272","authenticated-orcid":false,"given":"Yongqiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8094-8112","authenticated-orcid":false,"given":"Jingzhe","family":"Tang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4910-6095","authenticated-orcid":false,"given":"Zherui","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3144-4398","authenticated-orcid":false,"given":"Chuan","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Computing and Data Science, The University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3989-4358","authenticated-orcid":false,"given":"Yanghua","family":"Peng","sequence":"additional","affiliation":[{"name":"ByteDance, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4879-5335","authenticated-orcid":false,"given":"Haibin","family":"Lin","sequence":"additional","affiliation":[{"name":"ByteDance, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3043-522X","authenticated-orcid":false,"given":"Wencong","family":"Xiao","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8346-3323","authenticated-orcid":false,"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2159-4971","authenticated-orcid":false,"given":"Liang","family":"Xiang","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et al. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","volume-title":"Meet Claude. https:\/\/www.anthropic.com\/claude.","year":"2023","unstructured":"Anthropic. 2023. Meet Claude. https:\/\/www.anthropic.com\/claude."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_2_2_4_1","first-page":"430","article-title":"Pathways: Asynchronous Distributed Dataflow for ML","volume":"4","author":"Barham Paul","year":"2022","unstructured":"Paul Barham, Aakanksha Chowdhery, Jeff Dean, Sanjay Ghemawat, Steven Hand, Daniel Hurt, Michael Isard, Hyeontaek Lim, Ruoming Pang, Sudip Roy, et al. 2022. Pathways: Asynchronous Distributed Dataflow for ML. Proceedings of Machine Learning and Systems 4 (2022), 430\u2013449.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_2_5_1","unstructured":"Ben Frederickson. 2019. py-spy. https:\/\/github.com\/benfred\/py-spy."},{"key":"e_1_3_2_2_6_1","unstructured":"BigScience. 2022. BLOOM. https:\/\/github.com\/bigscience-workshop\/bigscience\/blob\/master\/train\/tr11-176B-ml\/chronicles.md."},{"key":"e_1_3_2_2_7_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_2_2_8_1","unstructured":"ByteDance Seed. 2025. Doubao-1.5-pro. https:\/\/seed.bytedance.com\/en\/special\/doubao_1_5_pro\/."},{"key":"e_1_3_2_2_9_1","unstructured":"ByteDance Seed. 2025. Technical Introduction to the Seed1.6 Model Series. https:\/\/seed.bytedance.com\/en\/seed1_6."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2406.06858"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651379"},{"key":"e_1_3_2_2_12_1","volume-title":"Training Deep Nets with Sublinear Memory Cost. arXiv preprint arXiv:1604.06174","author":"Chen Tianqi","year":"2016","unstructured":"Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. arXiv preprint arXiv:1604.06174 (2016)."},{"key":"e_1_3_2_2_13_1","first-page":"1","article-title":"PaLM: Scaling Language Modeling with Pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. PaLM: Scaling Language Modeling with Pathways. Journal of Machine Learning Research 24, 240 (2023), 1\u2013113.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_14_1","volume-title":"Silent Data Corruptions at Scale. arXiv preprint arXiv:2102.11245","author":"Dixit Harish Dattatraya","year":"2021","unstructured":"Harish Dattatraya Dixit, Sneha Pendharkar, Matt Beadon, Chris Mason, Tejasvi Chakravarthy, Bharath Muthiah, and Sriram Sankar. 2021. Silent Data Corruptions at Scale. arXiv preprint arXiv:2102.11245 (2021)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2406.04594"},{"key":"e_1_3_2_2_16_1","volume-title":"Evolution of Aegis: Fault Diagnosis for AI Model Training Service in Production. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Dong Jianbo","year":"2025","unstructured":"Jianbo Dong, Kun Qian, Pengcheng Zhang, Zhilong Zheng, Liang Chen, Fei Feng, Yichi Xu, Yikai Zhu, Gang Lu, Xue Li, Zhihui Ren, Zhicheng Wang, Bin Luo, Peng Zhang, Yang Liu, Yanqing Chen, Yu Guan, Weicheng Wang, Chaojie Yang, Yang Zhang, Man Yuan, Hanyu Zhao, Yong Li, Zihan Zhao, Shan Li, Xianlong Zeng, Zhiping Yao, Binzhang Fu, Ennan Zhai, Wei Lin, Chao Wang, and Dennis Cai. 2025. Evolution of Aegis: Fault Diagnosis for AI Model Training Service in Production. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). USENIX Association, Philadelphia, PA, 865\u2013881. https:\/\/www.usenix.org\/conference\/nsdi25\/presentation\/dong"},{"key":"e_1_3_2_2_17_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16\u00d716 Words: Transformers for Image Recognition at Scale. arXiv:2010.11929 [cs.CV]"},{"key":"e_1_3_2_2_18_1","volume-title":"Liveput-Optimized DNN Training on Preemptible Instances. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Duan Jiangfei","year":"2024","unstructured":"Jiangfei Duan, Ziang Song, Xupeng Miao, Xiaoli Xi, Dahua Lin, Harry Xu, Minjia Zhang, and Zhihao Jia. 2024. Parcae: Proactive, Liveput-Optimized DNN Training on Preemptible Instances. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 1121\u20131139."},{"key":"e_1_3_2_2_19_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The Llama 3 Herd of Models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_2_20_1","volume-title":"Check-N-Run: A Checkpointing System for Training Deep Learning Recommendation Models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: A Checkpointing System for Training Deep Learning Recommendation Models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 929\u2013943."},{"key":"e_1_3_2_2_21_1","unstructured":"etcd. 2022. etcd: Distributed Reliable Key-Value Store for the Most Critical Data of a Distributed System. https:\/\/github.com\/etcd-io\/etcd."},{"key":"e_1_3_2_2_22_1","unstructured":"Apache Software Foundation. 2022. Hadoop Distributed File System. https:\/\/hadoop.apache.org\/docs\/current\/hadoop-project-dist\/hadoop-hdfs\/HdfsDesign.html."},{"key":"e_1_3_2_2_23_1","volume-title":"Large Language Models and Games: A Survey and Roadmap. arXiv preprint arXiv:2402.18659","author":"Gallotta Roberto","year":"2024","unstructured":"Roberto Gallotta, Graham Todd, Marvin Zammit, Sam Earle, Antonios Liapis, Julian Togelius, and Georgios N Yannakakis. 2024. Large Language Models and Games: A Survey and Roadmap. arXiv preprint arXiv:2402.18659 (2024)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3611643.3613898"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695960"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3639232"},{"key":"e_1_3_2_2_27_1","volume-title":"ByteScale: Efficient Scaling of LLM Training with a 2048K Context Length on More Than 12,000 GPUs. arXiv preprint arXiv:2502.21231","author":"Ge Hao","year":"2025","unstructured":"Hao Ge, Junda Feng, Qi Huang, Fangcheng Fu, Xiaonan Nie, Lei Zuo, Haibin Lin, Bin Cui, and Xin Liu. 2025. ByteScale: Efficient Scaling of LLM Training with a 2048K Context Length on More Than 12,000 GPUs. arXiv preprint arXiv:2502.21231 (2025)."},{"key":"e_1_3_2_2_28_1","volume-title":"Copilot: Your AI Pair Programmer. https:\/\/github.com\/features\/copilot.","year":"2022","unstructured":"Github. 2022. Copilot: Your AI Pair Programmer. https:\/\/github.com\/features\/copilot."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242086"},{"key":"e_1_3_2_2_30_1","unstructured":"Dong Guo Faming Wu Feida Zhu Fuxing Leng Guang Shi Haobin Chen Haoqi Fan Jian Wang Jianyu Jiang Jiawei Wang et al. 2025. Seed1. 5-VL Technical Report. arXiv preprint arXiv:2505.07062 (2025)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650085"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2401.00134"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589105"},{"key":"e_1_3_2_2_34_1","volume-title":"Measuring Massive Multitask Language Understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring Massive Multitask Language Understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465297"},{"key":"e_1_3_2_2_36_1","volume-title":"Characterization of Large Language Model Development in the Datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hu Qinghao","year":"2024","unstructured":"Qinghao Hu, Zhisheng Ye, Zerui Wang, Guoteng Wang, Meng Zhang, Qiaoling Chen, Peng Sun, Dahua Lin, Xiaolin Wang, Yingwei Luo, et al. 2024. Characterization of Large Language Model Development in the Datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 709\u2013729."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3102980.3103005"},{"key":"e_1_3_2_2_38_1","unstructured":"Yanping Huang Youlong Cheng Ankur Bapna Orhan Firat Dehao Chen Mia Chen HyoukJoong Lee Jiquan Ngiam Quoc V Le Yonghui Wu et al. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_2_39_1","volume-title":"Samyam Rajbhandari, and Yuxiong He.","author":"Jacobs Sam Ade","year":"2023","unstructured":"Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Shuaiwen Leon Song, Samyam Rajbhandari, and Yuxiong He. 2023. Deepspeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models. arXiv preprint arXiv:2309.14509 (2023)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613152"},{"key":"e_1_3_2_2_41_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 947\u2013960."},{"key":"e_1_3_2_2_42_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, Yulu Jia, Sun He, Hongmin Chen, Zhihao Bai, Qi Hou, Shipeng Yan, Ding Zhou, Yiyao Sheng, Zhuo Jiang, Haohan Xu, Haoran Wei, Zhang Zhang, Pengfei Nie, Leqi Zou, Sida Zhao, Liang Xiang, Zherui Liu, Zhe Li, Xiaoying Jia, Jianxi Ye, Xin Jin, and Xin Liu. 2024. MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 745\u2013760. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/jiang-ziheng"},{"key":"e_1_3_2_2_43_1","volume-title":"Adam: A Method for Stochastic Optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma. 2014. Adam: A Method for Stochastic Optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_44_1","volume-title":"Revisiting Reliability in Large-Scale Machine Learning Research Clusters. arXiv preprint arXiv:2410.21680","author":"Kokolis Apostolos","year":"2024","unstructured":"Apostolos Kokolis, Michael Kuchnik, John Hoffman, Adithya Kumar, Parth Malani, Faye Ma, Zachary DeVito, Shubho Sengupta, Kalyan Saladi, and Carole-Jean Wu. 2024. Revisiting Reliability in Large-Scale Machine Learning Research Clusters. arXiv preprint arXiv:2410.21680 (2024)."},{"key":"e_1_3_2_2_45_1","first-page":"341","article-title":"Reducing Activation Recomputation in Large Transformer Models","volume":"5","author":"Korthikanti Vijay Anand","year":"2023","unstructured":"Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. 2023. Reducing Activation Recomputation in Large Transformer Models. Proceedings of Machine Learning and Systems 5 (2023), 341\u2013353.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_2_46_1","volume-title":"Gshard: Scaling Giant Models with Conditional Computation and Automatic Sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. Gshard: Scaling Giant Models with Conditional Computation and Automatic Sharding. arXiv preprint arXiv:2006.16668 (2020)."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607054"},{"key":"e_1_3_2_2_48_1","volume-title":"Pytorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, et al. 2020. Pytorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv preprint arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_2_49_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-V3 Technical Report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_2_50_1","volume-title":"Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889","author":"Liu Hao","year":"2023","unstructured":"Hao Liu, Matei Zaharia, and Pieter Abbeel. 2023. Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889 (2023)."},{"key":"e_1_3_2_2_51_1","volume-title":"Perseus: A Fail-Slow Detection Framework for Cloud Storage Systems. In 21st USENIX Conference on File and Storage Technologies (FAST 23)","author":"Lu Ruiming","year":"2023","unstructured":"Ruiming Lu, Erci Xu, Yiming Zhang, Fengyi Zhu, Zhaosheng Zhu, Mengtian Wang, Zongpeng Zhu, Guangtao Xue, Jiwu Shu, Minglu Li, et al. 2023. Perseus: A Fail-Slow Detection Framework for Cloud Storage Systems. In 21st USENIX Conference on File and Storage Technologies (FAST 23). 49\u201364."},{"key":"e_1_3_2_2_52_1","unstructured":"Jeffrey Ma Hengzhi Pei Leonard Lausen and George Karypis. 2025. Understanding Silent Data Corruption in LLM Training. arXiv:2502.12340 [cs.LG] https:\/\/arxiv.org\/abs\/2502.12340"},{"key":"e_1_3_2_2_53_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). 203\u2013216."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_2_56_1","unstructured":"NVIDIA. 2024. Extended Utility Diagnostics (EUD). https:\/\/docs.nvidia.com\/datacenter\/dcgm\/latest\/user-guide\/dcgm-eud.html."},{"key":"e_1_3_2_2_57_1","unstructured":"NVIDIA. 2024. Xid Errors. https:\/\/docs.nvidia.com\/deploy\/xid-errors\/."},{"key":"e_1_3_2_2_58_1","unstructured":"OpenAI. 2022. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt."},{"key":"e_1_3_2_2_59_1","unstructured":"OpenAI. 2024. Introducing OpenAI o1. https:\/\/openai.com\/o1\/."},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_2_61_1","unstructured":"PyTorch Team. 2025. Flight Recorder for Debugging Stuck Jobs. https:\/\/docs.pytorch.org\/tutorials\/unstable\/flight_recorder_tutorial.html."},{"key":"e_1_3_2_2_62_1","volume-title":"Zero Bubble (Almost) Pipeline Parallelism. In The Twelfth International Conference on Learning Representations.","author":"Qi Penghui","year":"2024","unstructured":"Penghui Qi, Xinyi Wan, Guangxing Huang, and Min Lin. 2024. Zero Bubble (Almost) Pipeline Parallelism. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_63_1","volume-title":"Zero: Memory Optimizations Toward Training Trillion Parameter Models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201316","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. Zero: Memory Optimizations Toward Training Trillion Parameter Models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201316."},{"key":"e_1_3_2_2_64_1","volume-title":"Zero-Offload: Democratizing Billion-Scale Model Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. Zero-Offload: Democratizing Billion-Scale Model Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 551\u2013564."},{"key":"e_1_3_2_2_65_1","volume-title":"Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, et al.","author":"Roziere Baptiste","year":"2023","unstructured":"Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, et al. 2023. Code Llama: Open Foundation Models for Code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_2_66_1","volume-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_2_67_1","volume-title":"Hybrid-Flow: A Flexible and Efficient RLHF Framework. arXiv preprint arXiv: 2409.19256","author":"Sheng Guangming","year":"2024","unstructured":"Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu. 2024. Hybrid-Flow: A Flexible and Efficient RLHF Framework. arXiv preprint arXiv: 2409.19256 (2024)."},{"key":"e_1_3_2_2_68_1","volume-title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_2_69_1","volume-title":"Ekko: A Large-Scale Deep Learning Recommender System with Low-Latency Model Update. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Sima Chijun","year":"2022","unstructured":"Chijun Sima, Yao Fu, Man-Kit Sit, Liyi Guo, Xuri Gong, Feng Lin, Junyu Wu, Yongsheng Li, Haidong Rong, Pierre-Louis Aublin, et al. 2022. Ekko: A Large-Scale Deep Learning Recommender System with Low-Latency Model Update. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 821\u2013839."},{"key":"e_1_3_2_2_70_1","volume-title":"NetBouncer: Active Device and Link Failure Localization in Data Center Networks. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Tan Cheng","year":"2019","unstructured":"Cheng Tan, Ze Jin, Chuanxiong Guo, Tianrong Zhang, Haitao Wu, Karl Deng, Dongming Bi, and Dong Xiang. 2019. NetBouncer: Active Device and Link Failure Localization in Data Center Networks. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). USENIX Association, Boston, MA, 599\u2013614. https:\/\/www.usenix.org\/conference\/nsdi19\/presentation\/tan"},{"key":"e_1_3_2_2_71_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al. 2024. Gemini 1.5: Unlocking Multimodal Understanding Across Millions of Tokens of Context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_2_72_1","unstructured":"KubeDL Team. 2024. KubeDL Makes Deep Learning Workloads Run on Kubernetes More Easily and Efficiently. https:\/\/kubedl.io\/."},{"key":"e_1_3_2_2_73_1","volume-title":"Kubeflow: The Machine Learning Toolkit for Kubernetes. https:\/\/www.kubeflow.org\/.","author":"Team Kubeflow","year":"2024","unstructured":"Kubeflow Team. 2024. Kubeflow: The Machine Learning Toolkit for Kubernetes. https:\/\/www.kubeflow.org\/."},{"key":"e_1_3_2_2_74_1","unstructured":"Kimi Team Yifan Bai Yiping Bao Guanduo Chen Jiahao Chen Ningxin Chen Ruijue Chen Yanru Chen Yuankun Chen Yutian Chen et al. 2025. Kimi K2: Open Agentic Intelligence. arXiv preprint arXiv:2507.20534 (2025)."},{"key":"e_1_3_2_2_75_1","unstructured":"NVIDIA Team. 2021. NVIDIA DCGM. https:\/\/developer.nvidia.com\/dcgm."},{"key":"e_1_3_2_2_76_1","unstructured":"NVIDIA Team. 2022. NVIDIA GPU Memory Error Management. https:\/\/docs.nvidia.com\/deploy\/a100-gpu-mem-error-mgmt\/index.html#row-mapping."},{"key":"e_1_3_2_2_77_1","unstructured":"Volcano Team. 2024. VolcanoJob. https:\/\/volcano.sh\/en\/docs\/vcjob\/."},{"key":"e_1_3_2_2_78_1","volume-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. 2023. Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 497\u2013513."},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695975"},{"key":"e_1_3_2_2_80_1","volume-title":"ByteCheckpoint: A Unified Checkpointing System for Large Foundation Model Development. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Wan Borui","year":"2025","unstructured":"Borui Wan, Mingji Han, Yiyao Sheng, Yanghua Peng, Haibin Lin, Mofan Zhang, Zhichao Lai, Menghan Yu, Junda Zhang, Zuquan Song, et al. 2025. ByteCheckpoint: A Unified Checkpointing System for Large Foundation Model Development. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). 559\u2013578."},{"key":"e_1_3_2_2_81_1","unstructured":"Wandb Team. 2025. AI is Easy to Productionize. https:\/\/wandb.ai\/site\/."},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613149"},{"key":"e_1_3_2_2_83_1","volume-title":"Atharva Naik, David Stap, et al.","author":"Wang Yizhong","year":"2022","unstructured":"Yizhong Wang, Swaroop Mishra, Pegah Alipoormolabashi, Yeganeh Kordi, Amirreza Mirzaei, Anjana Arunkumar, Arjun Ashok, Arut Selvan Dhanasekaran, Atharva Naik, David Stap, et al. 2022. Super-Naturalinstructions: Generalization via Declarative Instructions on 1600+ NLP Tasks. arXiv preprint arXiv:2204.07705 (2022)."},{"key":"e_1_3_2_2_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_2_85_1","volume-title":"Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le.","author":"Wei Jason","year":"2021","unstructured":"Jason Wei, Maarten Bosma, Vincent Y Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. 2021. Finetuned Language Models are Zero-Shot Learners. arXiv preprint arXiv:2109.01652 (2021)."},{"key":"e_1_3_2_2_86_1","volume-title":"MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 945\u2013960."},{"key":"e_1_3_2_2_87_1","unstructured":"xAI. 2024. xAI Blog. https:\/\/x.ai\/blog."},{"key":"e_1_3_2_2_88_1","volume-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018, Carlsbad, CA, USA, October 8-10, 2018. USENIX Association, 595\u2013610. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/xiao"},{"key":"e_1_3_2_2_89_1","volume-title":"SuperBench: Improving Cloud AI Infrastructure Reliability with Proactive Validation. In 2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Xiong Yifan","year":"2024","unstructured":"Yifan Xiong, Yuting Jiang, Ziyue Yang, Lei Qu, Guoshuai Zhao, Shuguang Liu, Dong Zhong, Boris Pinzur, Jie Zhang, Yang Wang, et al. 2024. SuperBench: Improving Cloud AI Infrastructure Reliability with Proactive Validation. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 835\u2013850."},{"key":"e_1_3_2_2_90_1","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et al. 2025. Qwen3 Technical Report. arXiv preprint arXiv:2505.09388 (2025)."},{"key":"e_1_3_2_2_91_1","volume-title":"Xu Zhao, Yongle Zhang, Pranay U. Jain, and Michael Stumm.","author":"Yuan Ding","year":"2014","unstructured":"Ding Yuan, Yu Luo, Xin Zhuang, Guilherme Renna Rodrigues, Xu Zhao, Yongle Zhang, Pranay U. Jain, and Michael Stumm. 2014. Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-Intensive Systems. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14). USENIX Association, Broomfield, CO, 249\u2013265. https:\/\/www.usenix.org\/conference\/osdi14\/technical-sessions\/presentation\/yuan"},{"key":"e_1_3_2_2_92_1","unstructured":"Aohan Zeng Xiao Liu Zhengxiao Du Zihan Wang Hanyu Lai Ming Ding Zhuoyi Yang Yifan Xu Wendi Zheng Xiao Xia et al. 2022. GLM-130B: An open Bilingual Pre-Trained Model. arXiv preprint arXiv:2210.02414 (2022)."},{"key":"e_1_3_2_2_93_1","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380362"},{"key":"e_1_3_2_2_94_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. OPT: Open Pre-Trained Transformer Language Models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_2_95_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Zhang Yuqi","year":"2024","unstructured":"Yuqi Zhang, Tianyi Zhang, Wenwen Hao, Shuyang Wang, Na Liu, Xing He, Yang Zhang, Weixin Wang, Yongguang Cheng, Huan Wang, et al. 2024. MSFRD: Mutation Similarity based SSD Failure Rating and Diagnosis for Complex and Volatile Production Environments. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 869\u2013884."},{"key":"e_1_3_2_2_96_1","volume-title":"OVERLORD: Ultimate Scaling of DataLoader for Multi-Source Large Foundation Model Training. arXiv preprint arXiv:2504.09844","author":"Zhao Juntao","year":"2025","unstructured":"Juntao Zhao, Qi Lu, Wei Jia, Borui Wan, Lei Zuo, Junda Feng, Jianyu Jiang, Yangrui Chen, Shuaishuai Cao, Jialing He, et al. 2025. OVERLORD: Ultimate Scaling of DataLoader for Multi-Source Large Foundation Model Training. arXiv preprint arXiv:2504.09844 (2025)."},{"key":"e_1_3_2_2_97_1","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer et al. 2023. Pytorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arXiv preprint arXiv:2304.11277 (2023).","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_2_98_1","volume-title":"SWIFT: Expedited Failure Recovery for Large-scale DNN Training","author":"Zhong Yuchen","year":"2024","unstructured":"Yuchen Zhong, Guangming Sheng, Juncheng Liu, Jinhui Yuan, and Chuan Wu. 2024. SWIFT: Expedited Failure Recovery for Large-scale DNN Training. IEEE Transactions on Parallel and Distributed Systems (2024)."}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731569.3764838","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,3]],"date-time":"2025-11-03T15:10:31Z","timestamp":1762182631000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764838"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":98,"alternative-id":["10.1145\/3731569.3764838","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764838","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}