{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:54:00Z","timestamp":1777064040799,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":92,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3803571","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"89-107","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Arena: Efficiently Training Large Models via Dynamic Scheduling and Adaptive Parallelism Co-Design"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9272-1732","authenticated-orcid":false,"given":"Chunyu","family":"Xue","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6646-5260","authenticated-orcid":false,"given":"Weihao","family":"Cui","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5832-0347","authenticated-orcid":false,"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9480-5632","authenticated-orcid":false,"given":"Chen","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1561-5329","authenticated-orcid":false,"given":"Han","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0802-7203","authenticated-orcid":false,"given":"Shulai","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7089-9199","authenticated-orcid":false,"given":"Linmei","family":"Wang","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4063-0772","authenticated-orcid":false,"given":"Yan","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9438-9181","authenticated-orcid":false,"given":"Limin","family":"Xiao","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4529-1679","authenticated-orcid":false,"given":"WeiFeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1915-9487","authenticated-orcid":false,"given":"Jing","family":"Yang","sequence":"additional","affiliation":[{"name":"Guizhou University, Guiyang, Guizhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8618-4581","authenticated-orcid":false,"given":"Bingsheng","family":"He","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-2302","authenticated-orcid":false,"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. Amazon EC2 P4 Instance. https:\/\/aws.amazon.com\/ec2\/instance-types\/p4\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. Distributed communication package - torch.distributed. https:\/\/pytorch.org\/docs\/stable\/distributed.html."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. Euclidean distance. https:\/\/en.wikipedia.org\/wiki\/Euclidean_distance."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. Hexagonal Sampling. https:\/\/en.wikipedia.org\/wiki\/Hexagonal_sampling."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. JAX: High-Performance Array Computing. https:\/\/jax.readthedocs.io\/en\/latest\/index.html."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. Multi-objective optimization. https:\/\/en.wikipedia.org\/wiki\/Multi-objective_optimization."},{"key":"e_1_3_2_1_7_1","unstructured":"2024. Nvidia Mellanox ConnectX-5. https:\/\/www.nvidia.com\/en-us\/networking\/ethernet\/connectx-5\/."},{"key":"e_1_3_2_1_8_1","unstructured":"2024. Nvidia Mellanox ConnectX-6. https:\/\/www.nvidia.com\/en-sg\/networking\/ethernet\/connectx-6\/."},{"key":"e_1_3_2_1_9_1","unstructured":"2024. PyTorch Dispatcher. http:\/\/blog.ezyang.com\/2020\/09\/lets-talk-about-the-pytorch-dispatcher\/."},{"key":"e_1_3_2_1_10_1","unstructured":"2024. Roofline model. https:\/\/en.wikipedia.org\/wiki\/Roofline_model."},{"key":"e_1_3_2_1_11_1","unstructured":"2024. TensorFlow XLA Compiler. https:\/\/www.tensorflow.org\/xla."},{"key":"e_1_3_2_1_12_1","unstructured":"2024. Torch FX. https:\/\/pytorch.org\/docs\/stable\/fx.html."},{"key":"e_1_3_2_1_13_1","unstructured":"2025. A high performance open source universal RPC framework. https:\/\/grpc.io."},{"key":"e_1_3_2_1_14_1","unstructured":"2025. Kubernetes. https:\/\/kubernetes.io\/."},{"key":"e_1_3_2_1_15_1","unstructured":"2025. Nvidia Ada Lovelace Architecture. https:\/\/www.nvidia.com\/en-us\/geforce\/ada-lovelace-architecture\/."},{"key":"e_1_3_2_1_16_1","unstructured":"2025. Nvidia Ampere Architecture. https:\/\/www.nvidia.com\/en-us\/data-center\/ampere-architecture\/."},{"key":"e_1_3_2_1_17_1","unstructured":"2025. Nvidia Blackwell Architecture. https:\/\/www.nvidia.com\/en-us\/data-center\/technologies\/blackwell-architecture\/."},{"key":"e_1_3_2_1_18_1","unstructured":"2025. NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_1_19_1","unstructured":"2025. NVIDIA CUDA Profiling Tools Interface (CUPTI). https:\/\/developer.nvidia.com\/cupti."},{"key":"e_1_3_2_1_20_1","unstructured":"2025. Nvidia Hopper Architecture. https:\/\/www.nvidia.com\/en-us\/data-center\/technologies\/hopper-architecture\/."},{"key":"e_1_3_2_1_21_1","unstructured":"2025. Nvidia Volta Architecture. https:\/\/www.nvidia.com\/en-us\/data-center\/volta-gpu-architecture\/."},{"key":"e_1_3_2_1_22_1","unstructured":"2025. NVLink and NVSwitch. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/."},{"key":"e_1_3_2_1_23_1","unstructured":"2025. Pareto front. https:\/\/en.wikipedia.org\/wiki\/Pareto_front."},{"key":"e_1_3_2_1_24_1","volume-title":"Taming throughput-latency tradeoff in llm inference with sarathi-serve. arXiv preprint arXiv:2403.02310","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming throughput-latency tradeoff in llm inference with sarathi-serve. arXiv preprint arXiv:2403.02310 (2024)."},{"key":"e_1_3_2_1_25_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the Fifteenth European Conference on Computer Systems. 1\u201316","author":"Chaudhary Shubham","year":"2020","unstructured":"Shubham Chaudhary, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, and Srinidhi Viswanatha. 2020. Balancing efficiency and fairness in heterogeneous GPU clusters for deep learning. In Proceedings of the Fifteenth European Conference on Computer Systems. 1\u201316."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"3","author":"Chen Chang","year":"2024","unstructured":"Chang Chen, Xiuhong Li, Qianchao Zhu, Jiangfei Duan, Peng Sun, Xingcheng Zhang, and Chao Yang. 2024. Centauri: Enabling Efficient Scheduling for Communication-Computation Overlap in Large Model Training via Communication Partitioning. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3 (La Jolla, CA, USA) (ASPLOS '24). Association for Computing Machinery, New York, NY, USA, 178\u2013191. 10.1145\/3620666.3651379"},{"key":"e_1_3_2_1_28_1","volume-title":"CrossPipe: Towards Optimal Pipeline Schedules for Cross-Datacenter Training. arXiv preprint arXiv:2507.00217","author":"Chen Tiancheng","year":"2025","unstructured":"Tiancheng Chen, Ales Kubicek, Langwen Huang, and Torsten Hoefler. 2025. CrossPipe: Towards Optimal Pipeline Schedules for Cross-Datacenter Training. arXiv preprint arXiv:2507.00217 (2025)."},{"key":"e_1_3_2_1_29_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. TVM: An automated End-to-End optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_2_1_30_1","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. arXiv:1604.06174 [cs.LG] https:\/\/arxiv.org\/abs\/1604.06174"},{"key":"e_1_3_2_1_31_1","unstructured":"Arthur Douillard Qixuan Feng Andrei A. Rusu Rachita Chhaparia Yani Donchev Adhiguna Kuncoro Marc'Aurelio Ranzato Arthur Szlam and Jiajun Shen. 2024. DiLoCo: Distributed Low-Communication Training of Language Models. arXiv:2311.08105 [cs.LG] https:\/\/arxiv.org\/abs\/2311.08105"},{"key":"e_1_3_2_1_32_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Faisal Abdullah Bin","unstructured":"Abdullah Bin Faisal, Noah Martin, Hafiz Mohsin Bashir, Swaminathan Lamelas, and Fahad R. Dogar. 2024. When will my ML Job finish? Toward providing Completion Time Estimates through Predictability-Centric Scheduling. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 487\u2013505. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/bin-faisal"},{"key":"e_1_3_2_1_33_1","volume-title":"Hardware Scaling Trends and Diminishing Returns in Large-Scale Distributed Training. arXiv preprint arXiv:2411.13055","author":"Fernandez Jared","year":"2024","unstructured":"Jared Fernandez, Luca Wehrstedt, Leonid Shamis, Mostafa Elhoushi, Kalyan Saladi, Yonatan Bisk, Emma Strubell, and Jacob Kahn. 2024. Hardware Scaling Trends and Diminishing Returns in Large-Scale Distributed Training. arXiv preprint arXiv:2411.13055 (2024)."},{"key":"e_1_3_2_1_34_1","unstructured":"Wikimedia Foundation. 2025. Wikimedia Downloads. https:\/\/dumps.wikimedia.org"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Gu Diandian","year":"2023","unstructured":"Diandian Gu, Yihao Zhao, Yinmin Zhong, Yifan Xiong, Zhenhua Han, Peng Cheng, Fan Yang, Gang Huang, Xin Jin, and Xuanzhe Liu. 2023. ElasticFlow: An Elastic Serverless Training Platform for Distributed Deep Learning. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2. 266\u2013280."},{"key":"e_1_3_2_1_36_1","first-page":"485","article-title":"Tiresias: A GPU Cluster Manager for Distributed Deep Learning","volume":"19","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Harry Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU Cluster Manager for Distributed Deep Learning.. In NSDI, Vol. 19. 485\u2013500.","journal-title":"NSDI"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","author":"He Jiaao","year":"2022","unstructured":"Jiaao He, Jidong Zhai, Tiago Antunes, Haojie Wang, Fuwen Luo, Shangfeng Shi, and Qin Li. 2022. FasterMoE: Modeling and Optimizing Training of Large-Scale Dynamic Pre-Trained Models. In Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (Seoul, Republic of Korea) (PPoPP '22). Association for Computing Machinery, New York, NY, USA, 120\u2013134."},{"key":"e_1_3_2_1_38_1","unstructured":"John L Hennessy and David A Patterson. 2017. Computer architecture: a quantitative approach. Morgan kaufmann."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Samuel Hsia Alicia Golden Bilge Acun Newsha Ardalani Zachary DeVito Gu-Yeon Wei David Brooks and Carole-Jean Wu. 2023. MAD Max Beyond Single-Node: Enabling Large Machine Learning Model Acceleration on Distributed Systems. arXiv:2310.02784 [cs.DC]","DOI":"10.1109\/ISCA59077.2024.00064"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201315","author":"Hu Qinghao","year":"2021","unstructured":"Qinghao Hu, Peng Sun, Shengen Yan, Yonggang Wen, and Tianwei Zhang. 2021. Characterization and prediction of deep learning workloads in large-scale gpu datacenters. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201315."},{"key":"e_1_3_2_1_41_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hu Qinghao","year":"2024","unstructured":"Qinghao Hu, Zhisheng Ye, Zerui Wang, Guoteng Wang, Meng Zhang, Qiaoling Chen, Peng Sun, Dahua Lin, Xiaolin Wang, Yingwei Luo, et al. 2024. Characterization of large language model development in the datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 709\u2013729."},{"key":"e_1_3_2_1_42_1","volume-title":"Scalable and Interpretable Scheduler for Deep Learning Training Jobs. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Hu Qinghao","year":"2023","unstructured":"Qinghao Hu, Meng Zhang, Peng Sun, Yonggang Wen, and Tianwei Zhang. 2023. Lucid: A Non-intrusive, Scalable and Interpretable Scheduler for Deep Learning Training Jobs. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2. 457\u2013472."},{"key":"e_1_3_2_1_43_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_44_1","first-page":"269","article-title":"Tutel: Adaptive mixture-of-experts at scale","volume":"5","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, Wei Cui, Yifan Xiong, Ziyue Yang, Ze Liu, Han Hu, Zilong Wang, Rafael Salas, Jithin Jose, Prabhat Ram, et al. 2023. Tutel: Adaptive mixture-of-experts at scale. Proceedings of Machine Learning and Systems 5 (2023), 269\u2013287.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles. 642\u2013657","author":"Subramanya Suhas Jayaram","year":"2023","unstructured":"Suhas Jayaram Subramanya, Daiyaan Arfeen, Shouxu Lin, Aurick Qiao, Zhihao Jia, and Gregory R Ganger. 2023. Sia: Heterogeneity-aware, goodput-optimized ML-cluster scheduling. In Proceedings of the 29th Symposium on Operating Systems Principles. 642\u2013657."},{"key":"e_1_3_2_1_46_1","volume-title":"USENIX Annual Technical Conference. 947\u2013960","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads.. In USENIX Annual Technical Conference. 947\u2013960."},{"key":"e_1_3_2_1_47_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Jia Xianyan","year":"2022","unstructured":"Xianyan Jia, Le Jiang, Ang Wang, Wencong Xiao, Ziji Shi, Jie Zhang, Xinyuan Li, Langshi Chen, Yong Li, Zhen Zheng, et al. 2022. Whale: Efficient giant model training over heterogeneous {GPUs}. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 673\u2013688."},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems. 542\u2013559","author":"Jiang Chenyu","year":"2024","unstructured":"Chenyu Jiang, Zhen Jia, Shuai Zheng, Yida Wang, and Chuan Wu. 2024. DynaPipe: Optimizing multi-task training through dynamic pipelines. In Proceedings of the Nineteenth European Conference on Computer Systems. 542\u2013559."},{"key":"e_1_3_2_1_49_1","first-page":"341","article-title":"Reducing activation recomputation in large transformer models","volume":"5","author":"Korthikanti Vijay Anand","year":"2023","unstructured":"Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. 2023. Reducing activation recomputation in large transformer models. Proceedings of Machine Learning and Systems 5 (2023), 341\u2013353.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Lee Seonho","year":"2025","unstructured":"Seonho Lee, Amar Phanishayee, and Divya Mahajan. 2025. Forecasting GPU performance for deep learning training and inference. In Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1. 493\u2013508."},{"key":"e_1_3_2_1_51_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020)."},{"key":"e_1_3_2_1_52_1","volume-title":"11th USENIX Symposium on operating systems design and implementation (OSDI 14)","author":"Li Mu","year":"2014","unstructured":"Mu Li, David G Andersen, Jun Woo Park, Alexander J Smola, Amr Ahmed, Vanja Josifovski, James Long, Eugene J Shekita, and Bor-Yiing Su. 2014. Scaling distributed machine learning with the parameter server. In 11th USENIX Symposium on operating systems design and implementation (OSDI 14). 583\u2013598."},{"key":"e_1_3_2_1_53_1","volume-title":"Easyscale: Accuracy-consistent elastic training for deep learning. arXiv preprint arXiv:2208.14228","author":"Li Mingzhen","year":"2022","unstructured":"Mingzhen Li, Wencong Xiao, Biao Sun, Hanyu Zhao, Hailong Yang, Shiru Ren, Zhongzhi Luan, Xianyan Jia, Yi Liu, Yong Li, et al. 2022. Easyscale: Accuracy-consistent elastic training for deep learning. arXiv preprint arXiv:2208.14228 (2022)."},{"key":"e_1_3_2_1_54_1","volume-title":"Flash Communication: Reducing Tensor Parallelization Bottleneck for Fast Large Language Model Inference. arXiv preprint arXiv:2412.04964","author":"Li Qingyuan","year":"2024","unstructured":"Qingyuan Li, Bo Zhang, Liang Ye, Yifan Zhang, Wei Wu, Yerui Sun, Lin Ma, and Yuchen Xie. 2024. Flash Communication: Reducing Tensor Parallelization Bottleneck for Fast Large Language Model Inference. arXiv preprint arXiv:2412.04964 (2024)."},{"key":"e_1_3_2_1_55_1","unstructured":"Rongzhi Li Ruogu Du Zefang Chu Sida Zhao Chunlei Han Zuocheng Shi Yiwen Shao Huanle Han Long Huang Zherui Liu and Shufan Liu. 2025. Taming the Chaos: Coordinated Autoscaling for Heterogeneous and Disaggregated LLM Inference. arXiv:2508.19559 [cs.DC] https:\/\/arxiv.org\/abs\/2508.19559"},{"key":"e_1_3_2_1_56_1","unstructured":"Rui Li Xiaoyun Zhi Jinxin Chi Menghan Yu Lixin Huang Jia Zhu Weilun Zhang Xing Ma Wenjia Liu Zhicheng Zhu Daowen Luo Zuquan Song Xin Yin Chao Xiang Shuguang Wang Wencong Xiao and Gene Cooperman. 2025. BootSeer: Analyzing and Mitigating Initialization Bottlenecks in Large-Scale LLM Training. arXiv:2507.12619 [cs.LG] https:\/\/arxiv.org\/abs\/2507.12619"},{"key":"e_1_3_2_1_57_1","volume-title":"International Conference on Machine Learning. PMLR, 6543\u20136552","author":"Li Zhuohan","year":"2021","unstructured":"Zhuohan Li, Siyuan Zhuang, Shiyuan Guo, Danyang Zhuo, Hao Zhang, Dawn Song, and Ion Stoica. 2021. Terapipe: Token-level pipeline parallelism for training large-scale language models. In International Conference on Machine Learning. PMLR, 6543\u20136552."},{"key":"e_1_3_2_1_58_1","volume-title":"2024 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 803\u2013816","author":"Lin Zhiqi","year":"2024","unstructured":"Zhiqi Lin, Youshan Miao, Guanbin Xu, Cheng Li, Olli Saarikivi, Saeed Maleki, and Fan Yang. 2024. Tessel: Boosting Distributed Execution of Large DNN Models via Flexible Schedule Search. In 2024 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 803\u2013816."},{"key":"e_1_3_2_1_59_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lin Zhiqi","year":"2024","unstructured":"Zhiqi Lin, Youshan Miao, Quanlu Zhang, Fan Yang, Yi Zhu, Cheng Li, Saeed Maleki, Xu Cao, Ning Shang, Yilei Yang, et al. 2024. {nnScaler} : {Constraint-Guided} Parallelization Plan Generation for Deep Learning Training. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 347\u2013363."},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems. 163\u2013181","author":"Liu Guodong","year":"2024","unstructured":"Guodong Liu, Youshan Miao, Zhiqi Lin, Xiaoxiang Shi, Saeed Maleki, Fan Yang, Yungang Bao, and Sa Wang. 2024. Aceso: Efficient parallel dnn training through iterative bottleneck alleviation. In Proceedings of the Nineteenth European Conference on Computer Systems. 163\u2013181."},{"key":"e_1_3_2_1_61_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. Rammer: Enabling holistic deep learning compiler optimizations with {rTasks}. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 881\u2013897."},{"key":"e_1_3_2_1_62_1","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation.","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. 2020. Themis: Fair and efficient GPU cluster scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation."},{"key":"e_1_3_2_1_63_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Moritz Philipp","year":"2018","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael I Jordan, et al. 2018. Ray: A distributed framework for emerging AI applications. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 561\u2013577."},{"key":"e_1_3_2_1_64_1","volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles. 1\u201315","author":"Narayanan Deepak","year":"2019","unstructured":"Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, Nikhil R Devanur, Gregory R Ganger, Phillip B Gibbons, and Matei Zaharia. 2019. PipeDream: Generalized pipeline parallelism for DNN training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles. 1\u201315."},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 481\u2013498","author":"Narayanan Deepak","year":"2020","unstructured":"Deepak Narayanan, Keshav Santhanam, Fiodar Kazhamiaka, Amar Phanishayee, and Matei Zaharia. 2020. Heterogeneity-aware cluster scheduling policies for deep learning workloads. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 481\u2013498."},{"key":"e_1_3_2_1_66_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201315","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Mohammad Shoeybi, Jared Casper, Patrick LeGresley, Mostofa Patwary, Vijay Korthikanti, Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, et al. 2021. Efficient large-scale language model training on gpu clusters using megatron-lm. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201315."},{"key":"e_1_3_2_1_67_1","volume-title":"International Conference on Learning Representations.","author":"Qi Hang","year":"2016","unstructured":"Hang Qi, Evan R Sparks, and Ameet Talwalkar. 2016. Paleo: A performance model for deep neural networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_68_1","first-page":"1","article-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning","volume":"21","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R Ganger, and Eric P Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning.. In OSDI, Vol. 21. 1\u201318.","journal-title":"OSDI"},{"key":"e_1_3_2_1_69_1","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Qiu Haoran","year":"2023","unstructured":"Haoran Qiu, Weichao Mao, Chen Wang, Hubertus Franke, Alaa Youssef, Zbigniew T Kalbarczyk, Tamer Ba\u015far, and Ravishankar K Iyer. 2023. { AWARE} : Automate workload autoscaling with reinforcement learning in production cloud systems. In 2023 USENIX Annual Technical Conference (USENIX ATC 23). 387\u2013402."},{"key":"e_1_3_2_1_70_1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201316","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. Zero: Memory optimizations toward training trillion parameter models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201316."},{"key":"e_1_3_2_1_71_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. {Zero-offload}: Democratizing {billion-scale} model training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 551\u2013564."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"crossref","unstructured":"Teven Le Scao Thomas Wang Daniel Hesslow Lucile Saulnier Stas Bekman M Saiful Bari Stella Biderman Hady Elsahar Niklas Muennighoff Jason Phang et al. 2022. What language model to train if you have one million gpu hours? arXiv preprint arXiv:2210.15424 (2022).","DOI":"10.18653\/v1\/2022.findings-emnlp.54"},{"key":"e_1_3_2_1_73_1","volume-title":"Operator Fusion in XLA: Analysis and Evaluation. arXiv preprint arXiv:2301.13062","author":"Snider Daniel","year":"2023","unstructured":"Daniel Snider and Ruofan Liang. 2023. Operator Fusion in XLA: Analysis and Evaluation. arXiv preprint arXiv:2301.13062 (2023)."},{"key":"e_1_3_2_1_74_1","volume-title":"Proceedings of the 4th Workshop on Machine Learning and Systems","author":"Strati Foteini","year":"2024","unstructured":"Foteini Strati, Paul Elvinger, Tolga Kerimoglu, and Ana Klimovic. 2024. ML Training with Cloud GPU Shortages: Is Cross-Region the Answer?. In Proceedings of the 4th Workshop on Machine Learning and Systems (Athens, Greece) (EuroMLSys '24). Association for Computing Machinery, New York, NY, USA, 107\u2013116. 10.1145\/3642970.3655843"},{"key":"e_1_3_2_1_75_1","volume-title":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles (Lotte Hotel World","author":"Strati Foteini","year":"2025","unstructured":"Foteini Strati, Zhendong Zhang, George Manos, Ixeia S\u00e1nchez P\u00e9riz, Qinghao Hu, Tiancheng Chen, Berk Buzcu, Song Han, Pamela Delgado, and Ana Klimovic. 2025. Sailor: Automating Distributed Training over Dynamic, Heterogeneous, and Geo-distributed Clusters. In Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles (Lotte Hotel World, Seoul, Republic of Korea) (SOSP '25). Association for Computing Machinery, New York, NY, USA, 204\u2013220. 10.1145\/3731569.3764839"},{"key":"e_1_3_2_1_76_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_77_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Um Taegeon","year":"2024","unstructured":"Taegeon Um, Byungsoo Oh, Minyoung Kang, Woo-Yeon Lee, Goeun Kim, Dongseob Kim, Youngtaek Kim, Mohd Muzzammil, and Myeongjae Jeon. 2024. Metis: Fast Automatic Distributed Training on Heterogeneous {GPUs}. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 563\u2013578."},{"key":"e_1_3_2_1_78_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, et al. 2022. Unity: Accelerating {DNN} training through joint optimization of algebraic transformations and parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 267\u2013284."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695975"},{"key":"e_1_3_2_1_80_1","unstructured":"Borui Wan Mingji Han Yiyao Sheng Yanghua Peng Haibin Lin Mofan Zhang Zhichao Lai Menghan Yu Junda Zhang Zuquan Song et al. 2024. ByteCheckpoint: A Unified Checkpointing System for Large Foundation Model Development. arXiv preprint arXiv:2407.20143 (2024)."},{"key":"e_1_3_2_1_81_1","volume-title":"19th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 22).","author":"Weng Qizhen","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 22)."},{"key":"e_1_3_2_1_82_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, et al. 2018. Gandiva: Introspective cluster scheduling for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 595\u2013610."},{"key":"e_1_3_2_1_83_1","unstructured":"Wencong Xiao Shiru Ren Yong Li Yang Zhang Pengyang Hou Zhi Li Yihui Feng Wei Lin and Yangqing Jia. 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning.. In OSDI. 533\u2013548."},{"key":"e_1_3_2_1_84_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Xie Zhen","year":"2024","unstructured":"Zhen Xie, Murali Emani, Xiaodong Yu, Dingwen Tao, Xin He, Pengfei Su, Keren Zhou, and Venkatram Vishwanath. 2024. Centimani: Enabling Fast AI Accelerator Selection for DNN Training with a Novel Performance Predictor. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 1203\u20131221. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/xie"},{"key":"e_1_3_2_1_85_1","volume-title":"SkyPilot: An Intercloud Broker for Sky Computing. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Yang Zongheng","year":"2023","unstructured":"Zongheng Yang, Zhanghao Wu, Michael Luo, Wei-Lin Chiang, Romil Bhardwaj, Woosuk Kwon, Siyuan Zhuang, Frank Sifei Luan, Gautam Mittal, Scott Shenker, and Ion Stoica. 2023. SkyPilot: An Intercloud Broker for Sky Computing. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 437\u2013455. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/yang-zongheng"},{"key":"e_1_3_2_1_86_1","volume-title":"Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Yu Geoffrey X.","year":"2021","unstructured":"Geoffrey X. Yu, Yubo Gao, Pavel Golikov, and Gennady Pekhimenko. 2021. Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 503\u2013521."},{"key":"e_1_3_2_1_87_1","volume-title":"Wide residual networks. arXiv preprint arXiv:1605.07146","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2016. Wide residual networks. arXiv preprint arXiv:1605.07146 (2016)."},{"key":"e_1_3_2_1_88_1","volume-title":"Proceedings of the Eighteenth European Conference on Computer Systems","author":"Zhao Hanyu","year":"2023","unstructured":"Hanyu Zhao, Zhenhua Han, Zhi Yang, Quanlu Zhang, Mingxia Li, Fan Yang, Qianxi Zhang, Binyang Li, Yuqing Yang, Lili Qiu, Lintao Zhang, and Lidong Zhou. 2023. SiloD: A Co-design of Caching and Scheduling for Deep Learning Clusters. In Proceedings of the Eighteenth European Conference on Computer Systems (Rome, Italy) (EuroSys '23). Association for Computing Machinery, New York, NY, USA, 883\u2013898. 10.1145\/3552326.3567499"},{"key":"e_1_3_2_1_89_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 515\u2013532","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao, Zhenhua Han, Zhi Yang, Quanlu Zhang, Fan Yang, Lidong Zhou, Mao Yang, Francis CM Lau, Yuqi Wang, Yifan Xiong, et al. 2020. HiveD: Sharing a GPU cluster for deep learning with guarantees. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 515\u2013532."},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Pritam Damania Bernard Nguyen Geeta Chauhan Yuchen Hao Ajit Mathews and Shen Li. 2023. Py-Torch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arXiv:2304.11277 [cs.DC] https:\/\/arxiv.org\/abs\/2304.11277","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_91_1","volume-title":"Alpa: Automating Inter-and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. 2022. Alpa: Automating Inter-and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 559\u2013578."},{"key":"e_1_3_2_1_92_1","volume-title":"Daydream: Accurately Estimating the Efficacy of Optimizations for DNN Training. In 2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Zhu Hongyu","year":"2020","unstructured":"Hongyu Zhu, Amar Phanishayee, and Gennady Pekhimenko. 2020. Daydream: Accurately Estimating the Efficacy of Optimizations for DNN Training. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). USENIX Association, 337\u2013352."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:28:18Z","timestamp":1777062498000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3803571"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":92,"alternative-id":["10.1145\/3767295.3803571","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3803571","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}