{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T13:03:29Z","timestamp":1780664609256,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62572304"],"award-info":[{"award-number":["62572304"]}]},{"name":"National Natural Science Foundation of China","award":["62232011"],"award-info":[{"award-number":["62232011"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3803623","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"2002-2021","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Suika: Efficient and High-quality Re-scheduling of 3D-parallelized LLM Training Jobs in Shared Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1595-411X","authenticated-orcid":false,"given":"Yuxuan","family":"Wang","sequence":"first","affiliation":[{"name":"Zhiyuan College, Shanghai Jiao Tong University, Shanghai, China"},{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9679-8602","authenticated-orcid":false,"given":"Yanbo","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"},{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom, Shanghai, 
China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9480-5632","authenticated-orcid":false,"given":"Chen","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9272-1732","authenticated-orcid":false,"given":"Chunyu","family":"Xue","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9195-6443","authenticated-orcid":false,"given":"Qizhen","family":"Weng","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4930-1028","authenticated-orcid":false,"given":"Yin","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9287-9413","authenticated-orcid":false,"given":"Zeren","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1131-6237","authenticated-orcid":false,"given":"Xuqi","family":"Zhu","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9733-4346","authenticated-orcid":false,"given":"Yongqiang","family":"Yang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Shenzhen, 
China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5832-0347","authenticated-orcid":false,"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-2302","authenticated-orcid":false,"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44336-025-00031-y"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3480859"},{"key":"e_1_3_2_1_3_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877\u20131901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877\u20131901, 2020."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421299"},{"key":"e_1_3_2_1_6_1","volume-title":"Ladm: Long-context training data selection with attention-based dependency measurement for llms. arXiv preprint arXiv:2503.02502","author":"Chen Jianghao","year":"2025","unstructured":"Jianghao Chen, Junhong Wu, Yangyifan Xu, and Jiajun Zhang. Ladm: Long-context training data selection with attention-based dependency measurement for llms. 
arXiv preprint arXiv:2503.02502, 2025."},{"key":"e_1_3_2_1_7_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde De Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374, 2021."},{"key":"e_1_3_2_1_8_1","volume-title":"Large scale distributed deep networks. Advances in neural information processing systems, 25","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Marc'aurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, et al. Large scale distributed deep networks. Advances in neural information processing systems, 25, 2012."},{"key":"e_1_3_2_1_9_1","volume-title":"Qlora: Efficient finetuning of quantized llms. Advances in neural information processing systems, 36:10088\u201310115","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: Efficient finetuning of quantized llms. Advances in neural information processing systems, 36:10088\u201310115, 2023."},{"key":"e_1_3_2_1_10_1","volume-title":"Efficient training of large language models on distributed infrastructures: a survey. arXiv preprint arXiv:2407.20018","author":"Duan Jiangfei","year":"2024","unstructured":"Jiangfei Duan, Shuo Zhang, Zerui Wang, Lijuan Jiang, Wenwen Qu, Qinghao Hu, Guoteng Wang, Qizhen Weng, Hang Yan, Xingcheng Zhang, et al. Efficient training of large language models on distributed infrastructures: a survey. arXiv preprint arXiv:2407.20018, 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"The llama 3 herd of models. 
arXiv e-prints","author":"Dubey Abhimanyu","year":"2024","unstructured":"Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. The llama 3 herd of models. arXiv e-prints, pages arXiv-2407, 2024."},{"key":"e_1_3_2_1_12_1","volume-title":"From llm to nmt: Advancing low-resource machine translation with claude. arXiv preprint arXiv:2404.13813","author":"Enis Maxim","year":"2024","unstructured":"Maxim Enis and Mark Hopkins. From llm to nmt: Advancing low-resource machine translation with claude. arXiv preprint arXiv:2404.13813, 2024."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.2305016120"},{"key":"e_1_3_2_1_14_1","volume-title":"et al. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793","author":"Aohan Zeng Team GLM","year":"2024","unstructured":"Team GLM, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Dan Zhang, Diego Rojas, Guanyu Feng, Hanlin Zhao, et al. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793, 2024."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575721"},{"issue":"11","key":"e_1_3_2_1_16_1","first-page":"2808","article-title":"Intelligent resource estimation and network-efficient scheduling for deep learning jobs on distributed gpu clusters","volume":"33","author":"Gu Rong","year":"2021","unstructured":"Rong Gu, Yuquan Chen, Shuai Liu, Haipeng Dai, Guihai Chen, Kai Zhang, Yang Che, and Yihua Huang. Liquid: Intelligent resource estimation and network-efficient scheduling for deep learning jobs on distributed gpu clusters. 
IEEE Transactions on Parallel and Distributed Systems, 33(11):2808\u20132820, 2021.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"issue":"2","key":"e_1_3_2_1_17_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. Lora: Low-rank adaptation of large language models. ICLR, 1(2):3, 2022.","journal-title":"ICLR"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476223"},{"key":"e_1_3_2_1_19_1","first-page":"729","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hu Qinghao","year":"2024","unstructured":"Qinghao Hu, Zhisheng Ye, Zerui Wang, Guoteng Wang, Meng Zhang, Qiaoling Chen, Peng Sun, Dahua Lin, Xiaolin Wang, Yingwei Luo, et al. Characterization of large language model development in the datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pages 709\u2013729, 2024."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575705"},{"key":"e_1_3_2_1_21_1","volume-title":"et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_22_1","unstructured":"Hugging Face Inc. Hugging face. https:\/\/huggingface.co\/ 2025. 
Accessed: 2025-09-22."},{"key":"e_1_3_2_1_23_1","first-page":"739","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. Elastic resource sharing for distributed deep learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21), pages 721\u2013739, 2021."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613175"},{"key":"e_1_3_2_1_26_1","first-page":"960","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. Analysis of {Large-Scale} {Multi-Tenant} {GPU} clusters for {DNN} training workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19), pages 947\u2013960, 2019."},{"key":"e_1_3_2_1_27_1","first-page":"760","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, et al. {MegaScale}: Scaling large language model training to more than 10,000 {GPUs}. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pages 745\u2013760, 2024."},{"key":"e_1_3_2_1_28_1","first-page":"341","article-title":"Reducing activation recomputation in large transformer models","volume":"5","author":"Korthikanti Vijay Anand","year":"2023","unstructured":"Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. Reducing activation recomputation in large transformer models. 
Proceedings of Machine Learning and Systems, 5:341\u2013353, 2023.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_29_1","volume-title":"et al. Trainmover: An interruption-resilient and reliable ml training runtime. arXiv preprint arXiv:2412.12636","author":"Lao Chon Lam","year":"2024","unstructured":"Chon Lam Lao, Minlan Yu, Aditya Akella, Jiamin Cao, Yu Guan, Pengcheng Zhang, Zhilong Zheng, Yichi Xu, Ennan Zhai, Dennis Cai, et al. Trainmover: An interruption-resilient and reliable ml training runtime. arXiv preprint arXiv:2412.12636, 2024."},{"key":"e_1_3_2_1_30_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668, 2020."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3725322"},{"key":"e_1_3_2_1_32_1","first-page":"959","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, Cong Wang, and Hong Xu. Accelerating distributed { MoE} training and inference with lina. In 2023 USENIX Annual Technical Conference (USENIX ATC 23), pages 945\u2013959, 2023."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587445"},{"key":"e_1_3_2_1_34_1","volume-title":"Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, et al. 
Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704, 2020."},{"key":"e_1_3_2_1_35_1","volume-title":"Sequence parallelism: Long sequence training from system perspective. arXiv preprint arXiv:2105.13120","author":"Li Shenggui","year":"2021","unstructured":"Shenggui Li, Fuzhao Xue, Chaitanya Baranwal, Yongbin Li, and Yang You. Sequence parallelism: Long sequence training from system perspective. arXiv preprint arXiv:2105.13120, 2021."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3718958.3750480"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44336-024-00009-2"},{"key":"e_1_3_2_1_38_1","first-page":"1534","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Lian Xinyu","year":"2025","unstructured":"Xinyu Lian, Sam Ade Jacobs, Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, and Minjia Zhang. Universal checkpointing: A flexible and efficient distributed checkpointing system for {Large-Scale}{DNN} training with reconfigurable parallelism. In 2025 USENIX Annual Technical Conference (USENIX ATC 25), pages 1519\u20131534, 2025."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629554"},{"key":"e_1_3_2_1_40_1","volume-title":"Fagel: Fabric llms agent empowered embodied intelligence evolution with autonomous human-machine collaboration. arXiv preprint arXiv:2412.20297","author":"Liu Jia","year":"2024","unstructured":"Jia Liu and Min Chen. Fagel: Fabric llms agent empowered embodied intelligence evolution with autonomous human-machine collaboration. 
arXiv preprint arXiv:2412.20297, 2024."},{"key":"e_1_3_2_1_41_1","first-page":"304","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. Themis: Fair and efficient {GPU} cluster scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20), pages 289\u2013304, 2020."},{"key":"e_1_3_2_1_42_1","first-page":"216","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. { CheckFreq}: Frequent, {Fine-Grained} {DNN} checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21), pages 203\u2013216, 2021."},{"key":"e_1_3_2_1_43_1","first-page":"596","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Mohan Jayashree","year":"2022","unstructured":"Jayashree Mohan, Amar Phanishayee, Janardhan Kulkarni, and Vijay Chidambaram. Looking beyond {GPUs} for {DNN} scheduling on {Multi-Tenant} clusters. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 579\u2013596, 2022."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid49817.2020.00-76"},{"key":"e_1_3_2_1_47_1","volume-title":"Codegen: An open large language model for code with multi-turn program synthesis. arXiv preprint arXiv:2203.13474","author":"Nijkamp Erik","year":"2022","unstructured":"Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, and Caiming Xiong. 
Codegen: An open large language model for code with multi-turn program synthesis. arXiv preprint arXiv:2203.13474, 2022."},{"key":"e_1_3_2_1_48_1","volume-title":"CUDA Runtime API :: CUDA Toolkit Documentation - Events. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/group__CUDART__EVENT.html","author":"NVIDIA Corporation","year":"2025","unstructured":"NVIDIA Corporation. CUDA Runtime API :: CUDA Toolkit Documentation - Events. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/group__CUDART__EVENT.html, 2025. Accessed: 2025-09-22."},{"key":"e_1_3_2_1_49_1","volume-title":"Inter-process communication. https:\/\/developer.nvidia.com\/docs\/drive\/drive-os\/6.0.8\/public\/drive-os-linux-sdk\/common\/topics\/nvsci_nvsciipc\/Inter-ProcessCommunication1.html","author":"NVIDIA Corporation","year":"2025","unstructured":"NVIDIA Corporation. Inter-process communication. https:\/\/developer.nvidia.com\/docs\/drive\/drive-os\/6.0.8\/public\/drive-os-linux-sdk\/common\/topics\/nvsci_nvsciipc\/Inter-ProcessCommunication1.html, 2025. Accessed: 2025-09-22."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_51_1","volume-title":"pybind11. https:\/\/github.com\/pybind\/pybind11","year":"2025","unstructured":"pybind11 Contributors. pybind11. https:\/\/github.com\/pybind\/pybind11, 2025. Accessed: 2025-09-22."},{"key":"e_1_3_2_1_52_1","volume-title":"https:\/\/pytorch.org\/","author":"Contributors PyTorch","year":"2025","unstructured":"PyTorch Contributors. Pytorch. https:\/\/pytorch.org\/, 2025. Accessed: 2025-09-22."},{"key":"e_1_3_2_1_53_1","first-page":"18","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Qiao Aurick","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning. 
In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21), pages 1\u201318. USENIX Association, July 2021."},{"issue":"8","key":"e_1_3_2_1_54_1","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et al. Language models are unsupervised multitask learners. OpenAI blog, 1(8):9, 2019.","journal-title":"OpenAI blog"},{"key":"e_1_3_2_1_55_1","first-page":"16","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. Zero: Memory optimizations toward training trillion parameter models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pages 1\u201316. IEEE, 2020."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_57_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593704"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44336-025-00020-1"},{"key":"e_1_3_2_1_60_1","volume-title":"Large language models as generalizable policies for embodied tasks. 
arXiv preprint arXiv:2310.17722","author":"Szot Andrew","year":"2023","unstructured":"Andrew Szot, Max Schwarzer, Harsh Agrawal, Bogdan Mazoure, Walter Talbott, Katherine Metcalf, Natalie Mackraz, Devon Hjelm, and Alexander Toshev. Large language models as generalizable policies for embodied tasks. arXiv preprint arXiv:2310.17722, 2023."},{"key":"e_1_3_2_1_61_1","volume-title":"Qwen2 technical report. arXiv preprint arXiv:2407.10671, 2","author":"Team Qwen","year":"2024","unstructured":"Qwen Team. Qwen2 technical report. arXiv preprint arXiv:2407.10671, 2, 2024."},{"key":"e_1_3_2_1_62_1","volume-title":"et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_63_1","volume-title":"Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et al. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288, 2023."},{"key":"e_1_3_2_1_64_1","first-page":"578","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Um Taegeon","year":"2024","unstructured":"Taegeon Um, Byungsoo Oh, Minyoung Kang, Woo-Yeon Lee, Goeun Kim, Dongseob Kim, Youngtaek Kim, Mohd Muzzammil, and Myeongjae Jeon. Metis: Fast automatic distributed training on heterogeneous {GPUs}. 
In 2024 USENIX Annual Technical Conference (USENIX ATC 24), pages 563\u2013578, 2024."},{"key":"e_1_3_2_1_65_1","first-page":"284","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, et al. Unity: Accelerating {DNN} training through joint optimization of algebraic transformations and parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 267\u2013284, 2022."},{"key":"e_1_3_2_1_66_1","volume-title":"Attention is all you need. Advances in neural information processing systems, 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695975"},{"key":"e_1_3_2_1_68_1","first-page":"578","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Wan Borui","year":"2025","unstructured":"Borui Wan, Mingji Han, Yiyao Sheng, Yanghua Peng, Haibin Lin, Mofan Zhang, Zhichao Lai, Menghan Yu, Junda Zhang, Zuquan Song, et al. {ByteCheckpoint}: A unified checkpointing system for large foundation model development. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25), pages 559\u2013578, 2025."},{"key":"e_1_3_2_1_69_1","volume-title":"Fast-persist: Accelerating model checkpointing in deep learning. arXiv preprint arXiv:2406.13768","author":"Wang Guanhua","year":"2024","unstructured":"Guanhua Wang, Olatunji Ruwase, Bing Xie, and Yuxiong He. 
Fast-persist: Accelerating model checkpointing in deep learning. arXiv preprint arXiv:2406.13768, 2024."},{"key":"e_1_3_2_1_70_1","volume-title":"et al. Wlb-llm: Workload-balanced 4d parallelism for large language model training. arXiv preprint arXiv:2503.17924","author":"Wang Zheng","year":"2025","unstructured":"Zheng Wang, Anna Cai, Xinfeng Xie, Zaifeng Pan, Yue Guan, Weiwei Chu, Jie Wang, Shikai Li, Jianyu Huang, Chris Cai, et al. Wlb-llm: Workload-balanced 4d parallelism for large language model training. arXiv preprint arXiv:2503.17924, 2025."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3064966"},{"key":"e_1_3_2_1_73_1","first-page":"548","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. {AntMan}: Dynamic scaling on { GPU} clusters for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 533\u2013548, 2020."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS47774.2020.00018"},{"key":"e_1_3_2_1_75_1","volume-title":"Xiaohui Tao, and Fu Lee Wang. Parameter-efficient fine-tuning methods for pretrained language models: A critical review and assessment. arXiv preprint arXiv:2312.12148","author":"Xu Lingling","year":"2023","unstructured":"Lingling Xu, Haoran Xie, Si-Zhao Joe Qin, Xiaohui Tao, and Fu Lee Wang. Parameter-efficient fine-tuning methods for pretrained language models: A critical review and assessment. arXiv preprint arXiv:2312.12148, 2023."},{"key":"e_1_3_2_1_76_1","volume-title":"Qwen3 technical report. 
arXiv preprint arXiv:2505.09388","author":"Yang An","year":"2025","unstructured":"An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. Qwen3 technical report. arXiv preprint arXiv:2505.09388, 2025."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS60453.2023.00126"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00632"},{"key":"e_1_3_2_1_79_1","volume-title":"Rubick: Exploiting job reconfigurability for deep learning cluster scheduling. arXiv preprint arXiv:2408.08586","author":"Zhang Xinyi","year":"2024","unstructured":"Xinyi Zhang, Hanyu Zhao, Wencong Xiao, Xianyan Jia, Fei Xu, Yong Li, Wei Lin, and Fangming Liu. Rubick: Exploiting job reconfigurability for deep learning cluster scheduling. arXiv preprint arXiv:2408.08586, 2024."},{"key":"e_1_3_2_1_80_1","first-page":"532","volume-title":"14th USENIX symposium on operating systems design and implementation (OSDI 20)","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao, Zhenhua Han, Zhi Yang, Quanlu Zhang, Fan Yang, Lidong Zhou, Mao Yang, Francis CM Lau, Yuqi Wang, Yifan Xiong, et al. {HiveD}: Sharing a { GPU} cluster for deep learning with guarantees. In 14th USENIX symposium on operating systems design and implementation (OSDI 20), pages 515\u2013532, 2020."},{"key":"e_1_3_2_1_81_1","first-page":"578","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. Alpa: Automating inter-and {Intra-Operator} parallelism for distributed deep learning. 
In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 559\u2013578, 2022."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3803623","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T12:08:24Z","timestamp":1780661304000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3803623"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":81,"alternative-id":["10.1145\/3767295.3803623","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3803623","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}