{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:52:55Z","timestamp":1777063975472,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62572304"],"award-info":[{"award-number":["62572304"]}]},{"name":"National Natural Science Foundation of China","award":["62232011"],"award-info":[{"award-number":["62232011"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3803623","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"2002-2021","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Suika: Efficient and High-quality Re-scheduling of 3D-parallelized LLM Training Jobs in Shared Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1595-411X","authenticated-orcid":false,"given":"Yuxuan","family":"Wang","sequence":"first","affiliation":[{"name":"Zhiyuan College, Shanghai Jiao Tong University, Shanghai, China"},{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9679-8602","authenticated-orcid":false,"given":"Yanbo","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"},{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9480-5632","authenticated-orcid":false,"given":"Chen","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao 
Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9272-1732","authenticated-orcid":false,"given":"Chunyu","family":"Xue","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9195-6443","authenticated-orcid":false,"given":"Qizhen","family":"Weng","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4930-1028","authenticated-orcid":false,"given":"Yin","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9287-9413","authenticated-orcid":false,"given":"Zeren","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1131-6237","authenticated-orcid":false,"given":"Xuqi","family":"Zhu","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9733-4346","authenticated-orcid":false,"given":"Yongqiang","family":"Yang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5832-0347","authenticated-orcid":false,"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, 
China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-2302","authenticated-orcid":false,"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"issue":"1","key":"e_1_3_2_1_1_1","first-page":"1","article-title":"Ai flow: Perspectives, scenarios, and approaches","volume":"3","author":"An Hongjun","year":"2026","unstructured":"Hongjun An, Wenhan Hu, Sida Huang, Siqi Huang, Ruanjun Li, Yuanzhi Liang, Jiawei Shao, Yiliang Song, Zihan Wang, Cheng Yuan, et al. Ai flow: Perspectives, scenarios, and approaches. Vicinagearth, 3(1):1, 2026.","journal-title":"Vicinagearth"},{"key":"e_1_3_2_1_2_1","first-page":"15","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Bian Zhengda","year":"2021","unstructured":"Zhengda Bian, Shenggui Li, Wei Wang, and Yang You. Online evolutionary batch size orchestration for scheduling deep learning workloads in gpu clusters. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pages 1\u201315, 2021."},{"key":"e_1_3_2_1_3_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877\u20131901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. 
Advances in neural information processing systems, 33:1877\u20131901, 2020."},{"key":"e_1_3_2_1_4_1","first-page":"16","volume-title":"Proceedings of the Fifteenth European Conference on Computer Systems","author":"Chaudhary Shubham","year":"2020","unstructured":"Shubham Chaudhary, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, and Srinidhi Viswanatha. Balancing efficiency and fairness in heterogeneous gpu clusters for deep learning. In Proceedings of the Fifteenth European Conference on Computer Systems, pages 1\u201316, 2020."},{"key":"e_1_3_2_1_5_1","first-page":"446","volume-title":"Proceedings of the 11th ACM Symposium on Cloud Computing","author":"Chen Chen","year":"2020","unstructured":"Chen Chen, Qizhen Weng, Wei Wang, Baochun Li, and Bo Li. Semi-dynamic load balancing: Efficient distributed learning in non-dedicated environments. In Proceedings of the 11th ACM Symposium on Cloud Computing, pages 431\u2013446, 2020."},{"key":"e_1_3_2_1_6_1","volume-title":"Ladm: Long-context training data selection with attention-based dependency measurement for llms. arXiv preprint arXiv:2503.02502","author":"Chen Jianghao","year":"2025","unstructured":"Jianghao Chen, Junhong Wu, Yangyifan Xu, and Jiajun Zhang. Ladm: Long-context training data selection with attention-based dependency measurement for llms. arXiv preprint arXiv:2503.02502, 2025."},{"key":"e_1_3_2_1_7_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde De Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374, 2021."},{"key":"e_1_3_2_1_8_1","volume-title":"Large scale distributed deep networks. 
Advances in neural information processing systems, 25","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Marc'aurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, et al. Large scale distributed deep networks. Advances in neural information processing systems, 25, 2012."},{"key":"e_1_3_2_1_9_1","volume-title":"Qlora: Efficient finetuning of quantized llms. Advances in neural information processing systems, 36:10088\u201310115","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: Efficient finetuning of quantized llms. Advances in neural information processing systems, 36:10088\u201310115, 2023."},{"key":"e_1_3_2_1_10_1","volume-title":"Efficient training of large language models on distributed infrastructures: a survey. arXiv preprint arXiv:2407.20018","author":"Duan Jiangfei","year":"2024","unstructured":"Jiangfei Duan, Shuo Zhang, Zerui Wang, Lijuan Jiang, Wenwen Qu, Qinghao Hu, Guoteng Wang, Qizhen Weng, Hang Yan, Xingcheng Zhang, et al. Efficient training of large language models on distributed infrastructures: a survey. arXiv preprint arXiv:2407.20018, 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"The llama 3 herd of models. arXiv e-prints","author":"Dubey Abhimanyu","year":"2024","unstructured":"Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. The llama 3 herd of models. arXiv e-prints, pages arXiv-2407, 2024."},{"key":"e_1_3_2_1_12_1","volume-title":"From llm to nmt: Advancing low-resource machine translation with claude. arXiv preprint arXiv:2404.13813","author":"Enis Maxim","year":"2024","unstructured":"Maxim Enis and Mark Hopkins. From llm to nmt: Advancing low-resource machine translation with claude. 
arXiv preprint arXiv:2404.13813, 2024."},{"issue":"30","key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","first-page":"e2305016120","DOI":"10.1073\/pnas.2305016120","article-title":"Chatgpt outperforms crowd workers for text-annotation tasks","volume":"120","author":"Gilardi Fabrizio","year":"2023","unstructured":"Fabrizio Gilardi, Meysam Alizadeh, and Ma\u00ebl Kubli. Chatgpt outperforms crowd workers for text-annotation tasks. Proceedings of the National Academy of Sciences, 120(30):e2305016120, 2023.","journal-title":"Proceedings of the National Academy of Sciences"},{"key":"e_1_3_2_1_14_1","volume-title":"et al. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793","author":"Aohan Zeng Team GLM","year":"2024","unstructured":"Team GLM, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Dan Zhang, Diego Rojas, Guanyu Feng, Hanlin Zhao, et al. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793, 2024."},{"key":"e_1_3_2_1_15_1","first-page":"280","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Gu Diandian","year":"2023","unstructured":"Diandian Gu, Yihao Zhao, Yinmin Zhong, Yifan Xiong, Zhenhua Han, Peng Cheng, Fan Yang, Gang Huang, Xin Jin, and Xuanzhe Liu. Elasticflow: An elastic serverless training platform for distributed deep learning. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2, ASPLOS 2023, page 266\u2013280, New York, NY, USA, 2023. 
Association for Computing Machinery."},{"issue":"11","key":"e_1_3_2_1_16_1","first-page":"2808","article-title":"Intelligent resource estimation and network-efficient scheduling for deep learning jobs on distributed gpu clusters","volume":"33","author":"Gu Rong","year":"2021","unstructured":"Rong Gu, Yuquan Chen, Shuai Liu, Haipeng Dai, Guihai Chen, Kai Zhang, Yang Che, and Yihua Huang. Liquid: Intelligent resource estimation and network-efficient scheduling for deep learning jobs on distributed gpu clusters. IEEE Transactions on Parallel and Distributed Systems, 33(11):2808\u20132820, 2021.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"issue":"2","key":"e_1_3_2_1_17_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. Lora: Low-rank adaptation of large language models. ICLR, 1(2):3, 2022.","journal-title":"ICLR"},{"key":"e_1_3_2_1_18_1","first-page":"15","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Hu Qinghao","year":"2021","unstructured":"Qinghao Hu, Peng Sun, Shengen Yan, Yonggang Wen, and Tianwei Zhang. Characterization and prediction of deep learning workloads in large-scale gpu datacenters. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pages 1\u201315, 2021."},{"key":"e_1_3_2_1_19_1","first-page":"729","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hu Qinghao","year":"2024","unstructured":"Qinghao Hu, Zhisheng Ye, Zerui Wang, Guoteng Wang, Meng Zhang, Qiaoling Chen, Peng Sun, Dahua Lin, Xiaolin Wang, Yingwei Luo, et al. Characterization of large language model development in the datacenter. 
In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pages 709\u2013729, 2024."},{"key":"e_1_3_2_1_20_1","first-page":"472","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Hu Qinghao","year":"2023","unstructured":"Qinghao Hu, Meng Zhang, Peng Sun, Yonggang Wen, and Tianwei Zhang. Lucid: A non-intrusive, scalable and interpretable scheduler for deep learning training jobs. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2, pages 457\u2013472, 2023."},{"key":"e_1_3_2_1_21_1","volume-title":"et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_22_1","unstructured":"Hugging Face Inc. Hugging face. https:\/\/huggingface.co\/ 2025. Accessed: 2025-09-22."},{"key":"e_1_3_2_1_23_1","first-page":"739","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. Elastic resource sharing for distributed deep learning. 
In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21), pages 721\u2013739, 2021."},{"issue":"1","key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","article-title":"Adaptive mixtures of local experts","volume":"3","author":"Jacobs Robert A","year":"1991","unstructured":"Robert A Jacobs, Michael I Jordan, Steven J Nowlan, and Geoffrey E Hinton. Adaptive mixtures of local experts. Neural computation, 3(1):79\u201387, 1991.","journal-title":"Neural computation"},{"key":"e_1_3_2_1_25_1","first-page":"657","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Subramanya Suhas Jayaram","year":"2023","unstructured":"Suhas Jayaram Subramanya, Daiyaan Arfeen, Shouxu Lin, Aurick Qiao, Zhihao Jia, and Gregory R Ganger. Sia: Heterogeneity-aware, goodput-optimized ml-cluster scheduling. In Proceedings of the 29th Symposium on Operating Systems Principles, pages 642\u2013657, 2023."},{"key":"e_1_3_2_1_26_1","first-page":"960","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. Analysis of {Large-Scale} {Multi-Tenant} {GPU} clusters for {DNN} training workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19), pages 947\u2013960, 2019."},{"key":"e_1_3_2_1_27_1","first-page":"760","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, et al. {MegaScale}: Scaling large language model training to more than 10,000 {GPUs}. 
In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pages 745\u2013760, 2024."},{"key":"e_1_3_2_1_28_1","first-page":"341","article-title":"Reducing activation recomputation in large transformer models","volume":"5","author":"Korthikanti Vijay Anand","year":"2023","unstructured":"Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. Reducing activation recomputation in large transformer models. Proceedings of Machine Learning and Systems, 5:341\u2013353, 2023.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_29_1","volume-title":"et al. Trainmover: An interruption-resilient and reliable ml training runtime. arXiv preprint arXiv:2412.12636","author":"Lao Chon Lam","year":"2024","unstructured":"Chon Lam Lao, Minlan Yu, Aditya Akella, Jiamin Cao, Yu Guan, Pengcheng Zhang, Zhilong Zheng, Yichi Xu, Ennan Zhai, Dennis Cai, et al. Trainmover: An interruption-resilient and reliable ml training runtime. arXiv preprint arXiv:2412.12636, 2024."},{"key":"e_1_3_2_1_30_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668, 2020."},{"issue":"3","key":"e_1_3_2_1_31_1","first-page":"1","article-title":"Straggler-resilient hybrid parallel training of large-scale models via malleable data and model parallelization","volume":"3","author":"Li Haoyang","year":"2025","unstructured":"Haoyang Li, Fangcheng Fu, Hao Ge, Sheng Lin, Xuanyu Wang, Jiawen Niu, Yujie Wang, Hailin Zhang, Xiaonan Nie, and Bin Cui. 
Malleus: Straggler-resilient hybrid parallel training of large-scale models via malleable data and model parallelization. Proceedings of the ACM on Management of Data, 3(3):1\u201328, 2025.","journal-title":"Proceedings of the ACM on Management of Data"},{"key":"e_1_3_2_1_32_1","first-page":"959","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, Cong Wang, and Hong Xu. Accelerating distributed { MoE} training and inference with lina. In 2023 USENIX Annual Technical Conference (USENIX ATC 23), pages 945\u2013959, 2023."},{"key":"e_1_3_2_1_33_1","first-page":"850","volume-title":"Proceedings of the Eighteenth European Conference on Computer Systems","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Hong Xu, Yibo Zhu, Zherui Liu, Chuanxiong Guo, and Cong Wang. Lyra: Elastic scheduling for deep learning clusters. In Proceedings of the Eighteenth European Conference on Computer Systems, pages 835\u2013850, 2023."},{"key":"e_1_3_2_1_34_1","volume-title":"Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, et al. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704, 2020."},{"key":"e_1_3_2_1_35_1","volume-title":"Sequence parallelism: Long sequence training from system perspective. arXiv preprint arXiv:2105.13120","author":"Li Shenggui","year":"2021","unstructured":"Shenggui Li, Fuzhao Xue, Chaitanya Baranwal, Yongbin Li, and Yang You. Sequence parallelism: Long sequence training from system perspective. 
arXiv preprint arXiv:2105.13120, 2021."},{"key":"e_1_3_2_1_36_1","first-page":"98","volume-title":"Proceedings of the ACM SIGCOMM 2025 Conference","author":"Li Wenxue","year":"2025","unstructured":"Wenxue Li, Xiangzhou Liu, Yunxuan Zhang, Zihao Wang, Wei Gu, Tao Qian, Gaoxiong Zeng, Shoushou Ren, Xinyang Huang, Zhenghang Ren, et al. Revisiting rdma reliability for lossy fabrics. In Proceedings of the ACM SIGCOMM 2025 Conference, pages 85\u201398, 2025."},{"issue":"1","key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1007\/s44336-024-00009-2","article-title":"A survey on llm-based multi-agent systems: workflow, infrastructure, and challenges","volume":"1","author":"Li Xinyi","year":"2024","unstructured":"Xinyi Li, Sai Wang, Siqi Zeng, Yu Wu, and Yi Yang. A survey on llm-based multi-agent systems: workflow, infrastructure, and challenges. Vicinagearth, 1(1):9, 2024.","journal-title":"Vicinagearth"},{"key":"e_1_3_2_1_38_1","first-page":"1534","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Lian Xinyu","year":"2025","unstructured":"Xinyu Lian, Sam Ade Jacobs, Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, and Minjia Zhang. Universal checkpointing: A flexible and efficient distributed checkpointing system for {Large-Scale}{DNN} training with reconfigurable parallelism. In 2025 USENIX Annual Technical Conference (USENIX ATC 25), pages 1519\u20131534, 2025."},{"key":"e_1_3_2_1_39_1","first-page":"181","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems","author":"Liu Guodong","year":"2024","unstructured":"Guodong Liu, Youshan Miao, Zhiqi Lin, Xiaoxiang Shi, Saeed Maleki, Fan Yang, Yungang Bao, and Sa Wang. Aceso: Efficient parallel dnn training through iterative bottleneck alleviation. 
In Proceedings of the Nineteenth European Conference on Computer Systems, pages 163\u2013181, 2024."},{"key":"e_1_3_2_1_40_1","volume-title":"Fagel: Fabric llms agent empowered embodied intelligence evolution with autonomous human-machine collaboration. arXiv preprint arXiv:2412.20297","author":"Liu Jia","year":"2024","unstructured":"Jia Liu and Min Chen. Fagel: Fabric llms agent empowered embodied intelligence evolution with autonomous human-machine collaboration. arXiv preprint arXiv:2412.20297, 2024."},{"key":"e_1_3_2_1_41_1","first-page":"304","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. Themis: Fair and efficient {GPU} cluster scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20), pages 289\u2013304, 2020."},{"key":"e_1_3_2_1_42_1","first-page":"216","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. { CheckFreq}: Frequent, {Fine-Grained} {DNN} checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21), pages 203\u2013216, 2021."},{"key":"e_1_3_2_1_43_1","first-page":"596","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Mohan Jayashree","year":"2022","unstructured":"Jayashree Mohan, Amar Phanishayee, Janardhan Kulkarni, and Vijay Chidambaram. Looking beyond {GPUs} for {DNN} scheduling on {Multi-Tenant} clusters. 
In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 579\u2013596, 2022."},{"key":"e_1_3_2_1_44_1","first-page":"15","volume-title":"Proceedings of the 27th ACM symposium on operating systems principles","author":"Narayanan Deepak","year":"2019","unstructured":"Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, Nikhil R Devanur, Gregory R Ganger, Phillip B Gibbons, and Matei Zaharia. Pipedream: Generalized pipeline parallelism for dnn training. In Proceedings of the 27th ACM symposium on operating systems principles, pages 1\u201315, 2019."},{"key":"e_1_3_2_1_45_1","first-page":"15","volume-title":"Proceedings of the international conference for high performance computing, networking, storage and analysis","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Mohammad Shoeybi, Jared Casper, Patrick LeGresley, Mostofa Patwary, Vijay Korthikanti, Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, et al. Efficient large-scale language model training on gpu clusters using megatron-lm. In Proceedings of the international conference for high performance computing, networking, storage and analysis, pages 1\u201315, 2021."},{"key":"e_1_3_2_1_46_1","first-page":"181","volume-title":"2020 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)","author":"Nicolae Bogdan","unstructured":"Bogdan Nicolae, Jiali Li, Justin M Wozniak, George Bosilca, Matthieu Dorier, and Franck Cappello. Deepfreeze: Towards scalable asynchronous checkpointing of deep learning models. In 2020 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID), pages 172\u2013181. IEEE, 2020."},{"key":"e_1_3_2_1_47_1","volume-title":"Codegen: An open large language model for code with multi-turn program synthesis. 
arXiv preprint arXiv:2203.13474","author":"Nijkamp Erik","year":"2022","unstructured":"Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, and Caiming Xiong. Codegen: An open large language model for code with multi-turn program synthesis. arXiv preprint arXiv:2203.13474, 2022."},{"key":"e_1_3_2_1_48_1","volume-title":"CUDA Runtime API :: CUDA Toolkit Documentation - Events. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/group__CUDART__EVENT.html","author":"NVIDIA Corporation","year":"2025","unstructured":"NVIDIA Corporation. CUDA Runtime API :: CUDA Toolkit Documentation - Events. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/group__CUDART__EVENT.html, 2025. Accessed: 2025-09-22."},{"key":"e_1_3_2_1_49_1","volume-title":"Inter-process communication. https:\/\/developer.nvidia.com\/docs\/drive\/drive-os\/6.0.8\/public\/drive-os-linux-sdk\/common\/topics\/nvsci_nvsciipc\/Inter-ProcessCommunication1.html","author":"NVIDIA Corporation","year":"2025","unstructured":"NVIDIA Corporation. Inter-process communication. https:\/\/developer.nvidia.com\/docs\/drive\/drive-os\/6.0.8\/public\/drive-os-linux-sdk\/common\/topics\/nvsci_nvsciipc\/Inter-ProcessCommunication1.html, 2025. Accessed: 2025-09-22."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the Thirteenth EuroSys Conference, EuroSys '18","author":"Peng Yanghua","year":"2018","unstructured":"Yanghua Peng, Yixin Bao, Yangrui Chen, Chuan Wu, and Chuanxiong Guo. Optimus: an efficient dynamic resource scheduler for deep learning clusters. In Proceedings of the Thirteenth EuroSys Conference, EuroSys '18, New York, NY, USA, 2018. Association for Computing Machinery."},{"key":"e_1_3_2_1_51_1","volume-title":"pybind11. https:\/\/github.com\/pybind\/pybind11","year":"2025","unstructured":"pybind11 Contributors. pybind11. https:\/\/github.com\/pybind\/pybind11, 2025. 
Accessed: 2025-09-22."},{"key":"e_1_3_2_1_52_1","volume-title":"https:\/\/pytorch.org\/","author":"Contributors PyTorch","year":"2025","unstructured":"PyTorch Contributors. Pytorch. https:\/\/pytorch.org\/, 2025. Accessed: 2025-09-22."},{"key":"e_1_3_2_1_53_1","first-page":"18","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Qiao Aurick","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21), pages 1\u201318. USENIX Association, July 2021."},{"issue":"8","key":"e_1_3_2_1_54_1","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et al. Language models are unsupervised multitask learners. OpenAI blog, 1(8):9, 2019.","journal-title":"OpenAI blog"},{"key":"e_1_3_2_1_55_1","first-page":"16","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. Zero: Memory optimizations toward training trillion parameter models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pages 1\u201316. IEEE, 2020."},{"key":"e_1_3_2_1_56_1","first-page":"3506","volume-title":"Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery & data mining","author":"Rasley Jeff","year":"2020","unstructured":"Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters. 
In Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery & data mining, pages 3505\u20133506, 2020."},{"key":"e_1_3_2_1_57_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_58_1","first-page":"214","volume-title":"Proceedings of the 37th International Conference on Supercomputing","author":"Singh Siddharth","year":"2023","unstructured":"Siddharth Singh, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He, and Abhinav Bhatele. A hybrid tensor-expert-data parallelism approach to optimize mixture-of-experts training. In Proceedings of the 37th International Conference on Supercomputing, pages 203\u2013214, 2023."},{"issue":"1","key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","first-page":"8","DOI":"10.1007\/s44336-025-00020-1","article-title":"Embodied intelligence for robot manipulation: development and challenges","volume":"2","author":"Song Honghao","year":"2025","unstructured":"Honghao Song, Liang Wang, Xiaozhen Qiao, Yifan Chen, Da Sun, and Zhe Sun. Embodied intelligence for robot manipulation: development and challenges. Vicinagearth, 2(1):8, 2025.","journal-title":"Vicinagearth"},{"key":"e_1_3_2_1_60_1","volume-title":"Large language models as generalizable policies for embodied tasks. arXiv preprint arXiv:2310.17722","author":"Szot Andrew","year":"2023","unstructured":"Andrew Szot, Max Schwarzer, Harsh Agrawal, Bogdan Mazoure, Walter Talbott, Katherine Metcalf, Natalie Mackraz, Devon Hjelm, and Alexander Toshev. Large language models as generalizable policies for embodied tasks. 
arXiv preprint arXiv:2310.17722, 2023."},{"key":"e_1_3_2_1_61_1","volume-title":"Qwen2 technical report. arXiv preprint arXiv:2407.10671, 2","author":"Team Qwen","year":"2024","unstructured":"Qwen Team. Qwen2 technical report. arXiv preprint arXiv:2407.10671, 2, 2024."},{"key":"e_1_3_2_1_62_1","volume-title":"et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_63_1","volume-title":"Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et al. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288, 2023."},{"key":"e_1_3_2_1_64_1","first-page":"578","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Um Taegeon","year":"2024","unstructured":"Taegeon Um, Byungsoo Oh, Minyoung Kang, Woo-Yeon Lee, Goeun Kim, Dongseob Kim, Youngtaek Kim, Mohd Muzzammil, and Myeongjae Jeon. Metis: Fast automatic distributed training on heterogeneous {GPUs}. In 2024 USENIX Annual Technical Conference (USENIX ATC 24), pages 563\u2013578, 2024."},{"key":"e_1_3_2_1_65_1","first-page":"284","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, et al. 
Unity: Accelerating {DNN} training through joint optimization of algebraic transformations and parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 267\u2013284, 2022."},{"key":"e_1_3_2_1_66_1","volume-title":"Attention is all you need. Advances in neural information processing systems, 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_67_1","first-page":"210","volume-title":"Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles","author":"Wagenl\u00e4nder Marcel","year":"2024","unstructured":"Marcel Wagenl\u00e4nder, Guo Li, Bo Zhao, Luo Mai, and Peter Pietzuch. Tenplex: Dynamic parallelism for deep learning using parallelizable tensor collections. In Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles, pages 195\u2013210, 2024."},{"key":"e_1_3_2_1_68_1","first-page":"578","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Wan Borui","year":"2025","unstructured":"Borui Wan, Mingji Han, Yiyao Sheng, Yanghua Peng, Haibin Lin, Mofan Zhang, Zhichao Lai, Menghan Yu, Junda Zhang, Zuquan Song, et al. {ByteCheckpoint}: A unified checkpointing system for large foundation model development. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25), pages 559\u2013578, 2025."},{"key":"e_1_3_2_1_69_1","volume-title":"Fast-persist: Accelerating model checkpointing in deep learning. arXiv preprint arXiv:2406.13768","author":"Wang Guanhua","year":"2024","unstructured":"Guanhua Wang, Olatunji Ruwase, Bing Xie, and Yuxiong He. Fast-persist: Accelerating model checkpointing in deep learning. 
arXiv preprint arXiv:2406.13768, 2024."},{"key":"e_1_3_2_1_70_1","volume-title":"et al. Wlb-llm: Workload-balanced 4d parallelism for large language model training. arXiv preprint arXiv:2503.17924","author":"Wang Zheng","year":"2025","unstructured":"Zheng Wang, Anna Cai, Xinfeng Xie, Zaifeng Pan, Yue Guan, Weiwei Chu, Jie Wang, Shikai Li, Jianyu Huang, Chris Cai, et al. Wlb-llm: Workload-balanced 4d parallelism for large language model training. arXiv preprint arXiv:2503.17924, 2025."},{"key":"e_1_3_2_1_71_1","first-page":"381","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Wang Zhuang","year":"2023","unstructured":"Zhuang Wang, Zhen Jia, Shuai Zheng, Zhen Zhang, Xinwei Fu, TS Eugene Ng, and Yida Wang. Gemini: Fast failure recovery in distributed training with in-memory checkpoints. In Proceedings of the 29th Symposium on Operating Systems Principles, pages 364\u2013381, 2023."},{"issue":"1","key":"e_1_3_2_1_72_1","first-page":"144","article-title":"Elastic deep learning in multi-tenant gpu clusters","volume":"33","author":"Wu Yidi","year":"2021","unstructured":"Yidi Wu, Kaihao Ma, Xiao Yan, Zhi Liu, Zhenkun Cai, Yuzhen Huang, James Cheng, Han Yuan, and Fan Yu. Elastic deep learning in multi-tenant gpu clusters. IEEE Transactions on Parallel and Distributed Systems, 33(1):144\u2013158, 2021.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"e_1_3_2_1_73_1","first-page":"548","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. {AntMan}: Dynamic scaling on { GPU} clusters for deep learning. 
In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 533\u2013548, 2020."},{"key":"e_1_3_2_1_74_1","first-page":"88","volume-title":"2020 IEEE 40th International Conference on Distributed Computing Systems (ICDCS)","author":"Xie Lei","unstructured":"Lei Xie, Jidong Zhai, Baodong Wu, Yuanbo Wang, Xingcheng Zhang, Peng Sun, and Shengen Yan. Elan: Towards generic and efficient elastic training for deep learning. In 2020 IEEE 40th International Conference on Distributed Computing Systems (ICDCS), pages 78\u201388. IEEE, 2020."},{"key":"e_1_3_2_1_75_1","volume-title":"Xiaohui Tao, and Fu Lee Wang. Parameter-efficient fine-tuning methods for pretrained language models: A critical review and assessment. arXiv preprint arXiv:2312.12148","author":"Xu Lingling","year":"2023","unstructured":"Lingling Xu, Haoran Xie, Si-Zhao Joe Qin, Xiaohui Tao, and Fu Lee Wang. Parameter-efficient fine-tuning methods for pretrained language models: A critical review and assessment. arXiv preprint arXiv:2312.12148, 2023."},{"key":"e_1_3_2_1_76_1","volume-title":"Qwen3 technical report. arXiv preprint arXiv:2505.09388","author":"Yang An","year":"2025","unstructured":"An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. Qwen3 technical report. arXiv preprint arXiv:2505.09388, 2025."},{"key":"e_1_3_2_1_77_1","first-page":"847","volume-title":"2023 IEEE 29th International Conference on Parallel and Distributed Systems (ICPADS)","author":"Zeng Fanlong","unstructured":"Fanlong Zeng, Wensheng Gan, Yongheng Wang, and Philip S Yu. Distributed training of large language models. In 2023 IEEE 29th International Conference on Parallel and Distributed Systems (ICPADS), pages 840\u2013847. 
IEEE, 2023."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1162\/tacl_a_00632","article-title":"Benchmarking large language models for news summarization","volume":"12","author":"Zhang Tianyi","year":"2024","unstructured":"Tianyi Zhang, Faisal Ladhak, Esin Durmus, Percy Liang, Kathleen McKeown, and Tatsunori B Hashimoto. Benchmarking large language models for news summarization. Transactions of the Association for Computational Linguistics, 12:39\u201357, 2024.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"e_1_3_2_1_79_1","volume-title":"Rubick: Exploiting job reconfigurability for deep learning cluster scheduling. arXiv preprint arXiv:2408.08586","author":"Zhang Xinyi","year":"2024","unstructured":"Xinyi Zhang, Hanyu Zhao, Wencong Xiao, Xianyan Jia, Fei Xu, Yong Li, Wei Lin, and Fangming Liu. Rubick: Exploiting job reconfigurability for deep learning cluster scheduling. arXiv preprint arXiv:2408.08586, 2024."},{"key":"e_1_3_2_1_80_1","first-page":"532","volume-title":"14th USENIX symposium on operating systems design and implementation (OSDI 20)","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao, Zhenhua Han, Zhi Yang, Quanlu Zhang, Fan Yang, Lidong Zhou, Mao Yang, Francis CM Lau, Yuqi Wang, Yifan Xiong, et al. {HiveD}: Sharing a { GPU} cluster for deep learning with guarantees. In 14th USENIX symposium on operating systems design and implementation (OSDI 20), pages 515\u2013532, 2020."},{"key":"e_1_3_2_1_81_1","first-page":"578","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. Alpa: Automating inter-and {Intra-Operator} parallelism for distributed deep learning. 
In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 559\u2013578, 2022."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:25:56Z","timestamp":1777062356000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3803623"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":81,"alternative-id":["10.1145\/3767295.3803623","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3803623","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}