{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T00:12:17Z","timestamp":1777421537861,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":79,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764839","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"204-220","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Sailor: Automating Distributed Training over Dynamic, Heterogeneous, and Geo-distributed Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3364-2109","authenticated-orcid":false,"given":"Foteini","family":"Strati","sequence":"first","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0843-6457","authenticated-orcid":false,"given":"Zhendong","family":"Zhang","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5767-8325","authenticated-orcid":false,"given":"George","family":"Manos","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1810-353X","authenticated-orcid":false,"given":"Ixeia S\u00e1nchez","family":"P\u00e9riz","sequence":"additional","affiliation":[{"name":"Unaffiliated, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1034-7502","authenticated-orcid":false,"given":"Qinghao","family":"Hu","sequence":"additional","affiliation":[{"name":"MIT, Cambridge, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8071-2552","authenticated-orcid":false,"given":"Tiancheng","family":"Chen","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1320-8006","authenticated-orcid":false,"given":"Berk","family":"Buzcu","sequence":"additional","affiliation":[{"name":"HES-SO Valais\/Wallis, Sierre, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4186-7618","authenticated-orcid":false,"given":"Song","family":"Han","sequence":"additional","affiliation":[{"name":"MIT, Cambridge, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9187-8497","authenticated-orcid":false,"given":"Pamela","family":"Delgado","sequence":"additional","affiliation":[{"name":"HES-SO Valais\/Wallis, Sierre, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8559-0529","authenticated-orcid":false,"given":"Ana","family":"Klimovic","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575754"},{"key":"e_1_3_2_1_2_1","volume-title":"CherryPick: Adaptively Unearthing the Best Cloud Configurations for Big Data Analytics. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Alipourfard Omid","year":"2017","unstructured":"Omid Alipourfard, Hongqiang Harry Liu, Jianshu Chen, Shivaram Venkataraman, Minlan Yu, and Ming Zhang. 2017. CherryPick: Adaptively Unearthing the Best Cloud Configurations for Big Data Analytics. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 469\u2013482. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/alipourfard"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_2_1_4_1","unstructured":"AWS. 
2025. Overview of Data Transfer Costs for Common Architectures. https:\/\/aws.amazon.com\/blogs\/architecture\/overview-of-data-transfer-costs-for-common-architectures\/."},{"key":"e_1_3_2_1_5_1","unstructured":"Azure. 2025. Azure Bandwidth pricing. https:\/\/azure.microsoft.com\/en-us\/pricing\/details\/bandwidth\/."},{"key":"e_1_3_2_1_6_1","unstructured":"Baseline. 2024. China achieves breakthrough in AI training. https:\/\/www.baselinemag.com\/news\/china-achieves-breakthrough-in-ai-training\/."},{"key":"e_1_3_2_1_7_1","unstructured":"Tiancheng Chen Ales Kubicek Langwen Huang and Torsten Hoefler. 2025. CrossPipe: Towards Optimal Pipeline Schedules for Cross-Datacenter Training. arXiv:2507.00217 [cs.DC] https:\/\/arxiv.org\/abs\/2507.00217"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624661"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624669"},{"key":"e_1_3_2_1_10_1","unstructured":"Google Cloud. 2024. Google Cloud - About GPUs. https:\/\/cloud.google.com\/compute\/docs\/gpus\/about-gpus."},{"key":"e_1_3_2_1_11_1","unstructured":"Google Cloud. 2025. Google Cloud All networking pricing. https:\/\/cloud.google.com\/vpc\/network-pricing."},{"key":"e_1_3_2_1_12_1","unstructured":"deepspeedai. 2025. Megatron-DeepSpeed. https:\/\/github.com\/deepspeedai\/Megatron-DeepSpeed."},{"key":"e_1_3_2_1_13_1","unstructured":"Arthur Douillard Qixuan Feng Andrei A. Rusu Rachita Chhaparia Yani Donchev Adhiguna Kuncoro Marc'Aurelio Ranzato Arthur Szlam and Jiajun Shen. 2024. DiLoCo: Distributed Low-Communication Training of Language Models. arXiv:2311.08105 [cs.LG] https:\/\/arxiv.org\/abs\/2311.08105"},{"key":"e_1_3_2_1_14_1","volume-title":"Liveput-Optimized DNN Training. 
In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Duan Jiangfei","year":"2024","unstructured":"Jiangfei Duan, Ziang Song, Xupeng Miao, Xiaoli Xi, Dahua Lin, Harry Xu, Minjia Zhang, and Zhihao Jia. 2024. Parcae: Proactive, Liveput-Optimized DNN Training on Preemptible Instances. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 1121\u20131139. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/duan"},{"key":"e_1_3_2_1_15_1","volume-title":"Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G. Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). USENIX Association, Boston, MA, 485\u2013500. https:\/\/www.usenix.org\/conference\/nsdi19\/presentation\/gu"},{"key":"e_1_3_2_1_16_1","volume-title":"Cephalo: Harnessing Heterogeneous GPU Clusters for Training Transformer Models. arXiv:2411.01075 [cs.DC] https:\/\/arxiv.org\/abs\/2411.01075","author":"Guo Runsheng Benson","year":"2024","unstructured":"Runsheng Benson Guo, Utkarsh Anand, Arthur Chen, and Khuzaima Daudjee. 2024. Cephalo: Harnessing Heterogeneous GPU Clusters for Training Transformer Models. arXiv:2411.01075 [cs.DC] https:\/\/arxiv.org\/abs\/2411.01075"},{"key":"e_1_3_2_1_17_1","unstructured":"Toms Hardware. 2025. Meta to build 2GW data center with over 1.3 million Nvidia AI GPUs."},{"key":"e_1_3_2_1_18_1","volume-title":"Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. 
Le, Yonghui Wu, and Zhifeng Chen.","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: efficient training of giant neural networks using pipeline parallelism. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_19_1","unstructured":"HuggingFace. 2025. HuggingFace GPT-Neo. https:\/\/huggingface.co\/docs\/transformers\/en\/model_doc\/gpt_neo."},{"key":"e_1_3_2_1_20_1","unstructured":"HuggingFace. 2025. HuggingFace OPT. https:\/\/huggingface.co\/docs\/transformers\/en\/model_doc\/opt."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613152"},{"key":"e_1_3_2_1_22_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 947\u2013960. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/jeon"},{"key":"e_1_3_2_1_23_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Jia Xianyan","year":"2022","unstructured":"Xianyan Jia, Le Jiang, Ang Wang, Wencong Xiao, Ziji Shi, Jie Zhang, Xinyuan Li, Langshi Chen, Yong Li, Zhen Zheng, Xiaoyong Liu, and Wei Lin. 2022. Whale: Efficient Giant Model Training over Heterogeneous GPUs. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA, 673\u2013688. 
https:\/\/www.usenix.org\/conference\/atc22\/presentation\/jia-xianyan"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Jiang Youhe","year":"2024","unstructured":"Youhe Jiang, Ran Yan, Xiaozhe Yao, Yang Zhou, Beidi Chen, and Binhang Yuan. 2024. HEXGEN: generative inference of large language model over heterogeneous environment. In Proceedings of the 41st International Conference on Machine Learning (Vienna, Austria) (ICML'24). JMLR.org, Article 881, 16 pages."},{"key":"e_1_3_2_1_25_1","unstructured":"Ziheng Jiang Haibin Lin Yinmin Zhong Qi Huang Yangrui Chen Zhi Zhang Yanghua Peng Xiang Li Cong Xie Shibiao Nong Yulu Jia Sun He Hongmin Chen Zhihao Bai Qi Hou Shipeng Yan Ding Zhou Yiyao Sheng Zhuo Jiang Haohan Xu Haoran Wei Zhang Zhang Pengfei Nie Leqi Zou Sida Zhao Liang Xiang Zherui Liu Zhe Li Xiaoying Jia Jianxi Ye Xin Jin and Xin Liu. 2024. MegaScale: Scaling Large Language Model Training to More Than 10 000 GPUs. arXiv:2402.15627 [cs.LG]"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD.2017.21"},{"key":"e_1_3_2_1_27_1","volume-title":"AMP: Automatically Finding Model Parallel Strategies with Heterogeneity Awareness. arXiv:2210.07297 [cs.LG]","author":"Li Dacheng","year":"2022","unstructured":"Dacheng Li, Hongyi Wang, Eric Xing, and Hao Zhang. 2022. AMP: Automatically Finding Model Parallel Strategies with Heterogeneity Awareness. arXiv:2210.07297 [cs.LG]"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2021.3066142"},{"key":"e_1_3_2_1_29_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lin Zhiqi","year":"2024","unstructured":"Zhiqi Lin, Youshan Miao, Quanlu Zhang, Fan Yang, Yi Zhu, Cheng Li, Saeed Maleki, Xu Cao, Ning Shang, Yilei Yang, Weijiang Xu, Mao Yang, Lintao Zhang, and Lidong Zhou. 2024. 
nnScaler: Constraint-Guided Parallelization Plan Generation for Deep Learning Training. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 347\u2013363. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/lin-zhiqi"},{"key":"e_1_3_2_1_30_1","unstructured":"Linkedin. 2024. The Heat Challenge of AI Infrastructure: A Growing Concern for Traditional Office Buildings and Older Data Centers. https:\/\/www.linkedin.com\/pulse\/heat-challenge-ai-infrastructure-gpu-servers-trgdatacenter-b6vcc."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629554"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Luo Liang","year":"2022","unstructured":"Liang Luo, Peter West, Pratyush Patel, Arvind Krishnamurthy, and Luis Ceze. 2022. SRIFTY: Swift and Thrifty Distributed Neural Network Training on the Cloud. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 833\u2013847. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2022\/file\/0cafb7890f6a7d4de65507d5bb7e0187-Paper.pdf"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717459"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707215"},{"key":"e_1_3_2_1_35_1","unstructured":"Meta. 2025. The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation. https:\/\/ai.meta.com\/blog\/llama-4-multimodal-intelligence\/."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.14778\/3598581.3598604"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.14778\/3570690.3570697"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640375"},{"key":"e_1_3_2_1_39_1","volume-title":"Fine-Grained DNN Checkpointing. 
In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 203\u2013216. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/mohan"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_41_1","volume-title":"Padmanabhan","author":"Gandhi Rohan","year":"2024","unstructured":"Palak, Rohan Gandhi, Karan Tandon, Debopam Bhattacherjee, and Venkata N. Padmanabhan. 2024. Improving training time and GPU utilization in geo-distributed language model training. arXiv:2411.14458 [cs.DC] https:\/\/arxiv.org\/abs\/2411.14458"},{"key":"e_1_3_2_1_42_1","volume-title":"Padmanabhan","author":"Gandhi Rohan","year":"2024","unstructured":"Palak, Rohan Gandhi, Karan Tandon, Debopam Bhattacherjee, and Venkata N. Padmanabhan. 2024. Improving training time and GPU utilization in geo-distributed language model training. arXiv:2411.14458 [cs.DC] https:\/\/arxiv.org\/abs\/2411.14458"},{"key":"e_1_3_2_1_43_1","unstructured":"PyTorch. 2025. PyTorch CUDA Events. https:\/\/pytorch.org\/docs\/stable\/generated\/torch.cuda.Event.html."},{"key":"e_1_3_2_1_44_1","unstructured":"PyTorch. 2025. PyTorch hooks. https:\/\/pytorch.org\/docs\/stable\/generated\/torch.Tensor.register_hook.html."},{"key":"e_1_3_2_1_45_1","unstructured":"PyTorch. 2025. Understanding CUDA Memory Usage. https:\/\/pytorch.org\/docs\/stable\/torch_cuda_memory.html."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_47_1","volume-title":"ZeRO-Offload: Democratizing Billion-Scale Model Training. 
In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 551\u2013564. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/ren-jie"},{"key":"e_1_3_2_1_48_1","unstructured":"Lisa Rivalin Lingyun Yi Megan Diefenbach Alex Bruefach Frances Amatruda and Tobias Tiecke. 2025. Estimating embodied carbon in data center hardware down to the individual screws. https:\/\/sustainability.atmeta.com\/blog\/2024\/09\/10\/estimating-embodied-carbon-in-data-center-hardware-down-to-the-individual-screws\/."},{"key":"e_1_3_2_1_49_1","unstructured":"Max Ryabinin Tim Dettmers Michael Diskin and Alexander Borzunov. 2023. SWARM Parallelism: Training Large Models Can Be Surprisingly Communication-Efficient. arXiv:2301.11913 [cs.DC]"},{"key":"e_1_3_2_1_50_1","volume-title":"Scaling Distributed Machine Learning with In-Network Aggregation. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Sapio Amedeo","year":"2021","unstructured":"Amedeo Sapio, Marco Canini, Chen-Yu Ho, Jacob Nelson, Panos Kalnis, Changhoon Kim, Arvind Krishnamurthy, Masoud Moshref, Dan Ports, and Peter Richtarik. 2021. Scaling Distributed Machine Learning with In-Network Aggregation. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). USENIX Association, 785\u2013808. https:\/\/www.usenix.org\/conference\/nsdi21\/presentation\/sapio"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Ian Schneider Hui Xu Stephan Benecke David Patterson Keguo Huang Parthasarathy Ranganathan and Cooper Elsworth. 2025. Life-Cycle Emissions of AI Hardware: A Cradle-To-Grave Approach and Generational Trends. 
arXiv:2502.01671 [cs.AR] https:\/\/arxiv.org\/abs\/2502.01671","DOI":"10.1109\/MM.2025.3592568"},{"key":"e_1_3_2_1_52_1","unstructured":"Semaphor. 2024. Microsoft Azure CTO: US data centers will soon hit size limits. https:\/\/www.semafor.com\/article\/10\/11\/2024\/microsoftazure-cto-us-data-centers-will-soon-hit-limits-of-energy-grid."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/IWQoS57198.2023.10188717"},{"key":"e_1_3_2_1_54_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv:1909.08053 [cs.CL]"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716025"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3642970.3655843"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707255"},{"key":"e_1_3_2_1_58_1","unstructured":"Foteini Strati Sara Mcallister Amar Phanishayee Jakub Tarnawski and Ana Klimovic. 2024. D\u00e9j\u00e0Vu: KV-cache Streaming for Fast Fault-tolerant Generative LLM Serving. arXiv:2403.01876 [cs.DC] https:\/\/arxiv.org\/abs\/2403.01876"},{"key":"e_1_3_2_1_59_1","volume-title":"Wortman Vaughan (Eds.)","volume":"34","author":"Tarnawski Jakub M","year":"2021","unstructured":"Jakub M Tarnawski, Deepak Narayanan, and Amar Phanishayee. 2021. Piper: Multidimensional Planner for DNN Parallelization. In Advances in Neural Information Processing Systems, M. Ranzato, A. Beygelzimer, Y. Dauphin, P.S. Liang, and J. Wortman Vaughan (Eds.), Vol. 34. Curran Associates, Inc., 24829\u201324840. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2021\/file\/d01eeca8b24321cd2fe89dd85b9beb51-Paper.pdf"},{"key":"e_1_3_2_1_60_1","volume-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. 
In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. 2023. Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 497\u2013513. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/thorpe"},{"key":"e_1_3_2_1_61_1","volume-title":"Microsoft surprises analysts with massive 80B AI investment plans for","year":"2025","unstructured":"tom's Hardware. 2025. Microsoft surprises analysts with massive 80B AI investment plans for 2025."},{"key":"e_1_3_2_1_62_1","volume-title":"Metis: Fast Automatic Distributed Training on Heterogeneous GPUs. In 2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Um Taegeon","year":"2024","unstructured":"Taegeon Um, Byungsoo Oh, Minyoung Kang, Woo-Yeon Lee, Goeun Kim, Dongseob Kim, Youngtaek Kim, Mohd Muzzammil, and Myeongjae Jeon. 2024. Metis: Fast Automatic Distributed Training on Heterogeneous GPUs. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 563\u2013578. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/um"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695975"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00041"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Wang Jue","year":"2023","unstructured":"Jue Wang, Yucheng Lu, Binhang Yuan, Beidi Chen, Percy Liang, Christopher De Sa, Christopher Re, and Ce Zhang. 2023. CocktailSGD: fine-tuning foundation models over 500mbps networks. 
In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML'23). JMLR.org, Article 1497, 19 pages."},{"key":"e_1_3_2_1_66_1","volume-title":"MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 945\u2013960. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/weng"},{"key":"e_1_3_2_1_67_1","unstructured":"Wikipedia. 2025. GPT-2. https:\/\/en.wikipedia.org\/wiki\/GPT-2."},{"key":"e_1_3_2_1_68_1","unstructured":"Wikipedia. 2025. GPT-4. https:\/\/en.wikipedia.org\/wiki\/GPT-4."},{"key":"e_1_3_2_1_69_1","unstructured":"Wikipedia. 2025. List of Nvidia graphics processing units. https:\/\/en.wikipedia.org\/wiki\/List_of_Nvidia_graphics_processing_units."},{"key":"e_1_3_2_1_70_1","volume-title":"FALCON: Pinpointing and Mitigating Stragglers for Large-Scale Hybrid-Parallel Training. arXiv:2410.12588 [cs.DC] https:\/\/arxiv.org\/abs\/2410.12588","author":"Wu Tianyuan","year":"2024","unstructured":"Tianyuan Wu, Wei Wang, Yinghao Yu, Siran Yang, Wenchao Wu, Qinkai Duan, Guodong Yang, Jiamang Wang, Lin Qu, and Liping Zhang. 2024. FALCON: Pinpointing and Mitigating Stragglers for Large-Scale Hybrid-Parallel Training. arXiv:2410.12588 [cs.DC] https:\/\/arxiv.org\/abs\/2410.12588"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3127479.3131614"},{"key":"e_1_3_2_1_72_1","unstructured":"Ran Yan Youhe Jiang Wangcheng Tao Xiaonan Nie Bin Cui and Binhang Yuan. 2024. 
FlashFlex: Accommodating Large Language Model Training over Heterogeneous Environment. arXiv:2409.01143 [cs.DC] https:\/\/arxiv.org\/abs\/2409.01143"},{"key":"e_1_3_2_1_73_1","volume-title":"SkyPilot: An Intercloud Broker for Sky Computing. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Yang Zongheng","year":"2023","unstructured":"Zongheng Yang, Zhanghao Wu, Michael Luo, Wei-Lin Chiang, Romil Bhardwaj, Woosuk Kwon, Siyuan Zhuang, Frank Sifei Luan, Gautam Mittal, Scott Shenker, and Ion Stoica. 2023. SkyPilot: An Intercloud Broker for Sky Computing. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 437\u2013455. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/yang-zongheng"},{"key":"e_1_3_2_1_74_1","volume-title":"Tianyi Zhang, Tri Dao, Beidi Chen, Percy Liang, Christopher Re, and Ce Zhang.","author":"Yuan Binhang","year":"2023","unstructured":"Binhang Yuan, Yongjun He, Jared Quincy Davis, Tianyi Zhang, Tri Dao, Beidi Chen, Percy Liang, Christopher Re, and Ce Zhang. 2023. Decentralized Training of Foundation Models in Heterogeneous Environments. arXiv:2206.01288 [cs.DC]"},{"key":"e_1_3_2_1_75_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Yuan Tailing","year":"2024","unstructured":"Tailing Yuan, Yuliang Liu, Xucheng Ye, Shenglong Zhang, Jianchao Tan, Bin Chen, Chengru Song, and Di Zhang. 2024. Accelerating the Training of Large Language Models using Efficient Activation Rematerialization and Optimal Hybrid Parallelism. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 545\u2013561. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/yuan"},{"key":"e_1_3_2_1_76_1","volume-title":"Poseidon: An Efficient Communication Architecture for Distributed Deep Learning on GPU Clusters. 
In 2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Zhang Hao","unstructured":"Hao Zhang, Zeyu Zheng, Shizhen Xu, Wei Dai, Qirong Ho, Xiaodan Liang, Zhiting Hu, Jinliang Wei, Pengtao Xie, and Eric P. Xing. 2017. Poseidon: An Efficient Communication Architecture for Distributed Deep Learning on GPU Clusters. In 2017 USENIX Annual Technical Conference (USENIX ATC 17). USENIX Association, Santa Clara, CA, 181\u2013193. https:\/\/www.usenix.org\/conference\/atc17\/technical-sessions\/presentation\/zhang"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337873"},{"key":"e_1_3_2_1_78_1","volume-title":"Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. arXiv:2201.12023 [cs.LG]","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P. Xing, Joseph E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. arXiv:2201.12023 [cs.LG]"},{"key":"e_1_3_2_1_79_1","unstructured":"Kan Zhu Yilong Zhao Liangyu Zhao Gefei Zuo Yile Gu Dedong Xie Yufei Gao Qinyu Xu Tian Tang Zihao Ye Keisuke Kamahori Chien-Yu Lin Stephanie Wang Arvind Krishnamurthy and Baris Kasikci. 2024. NanoFlow: Towards Optimal Large Language Model Serving Throughput. 
arXiv:2408.12757 [cs.DC] https:\/\/arxiv.org\/abs\/2408.12757"}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:44:40Z","timestamp":1759322680000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764839"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":79,"alternative-id":["10.1145\/3731569.3764839","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764839","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}