{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T12:59:37Z","timestamp":1780664377904,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":98,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"name":"Beijing Municipal Science and Technology Project","award":["Z241100004224024"],"award-info":[{"award-number":["Z241100004224024"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769330","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"564-585","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multipath Collective Communication Beyond Scale-up Networks in GPU Clouds"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5765-3825","authenticated-orcid":false,"given":"Yuchen","family":"Xu","sequence":"first","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8492-7983","authenticated-orcid":false,"given":"Jianglong","family":"Nie","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6714-5014","authenticated-orcid":false,"given":"Baojia","family":"Li","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9299-2585","authenticated-orcid":false,"given":"Mingzhuo","family":"Chen","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4554-8554","authenticated-orcid":false,"given":"Hao","family":"Lu","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6305-6217","authenticated-orcid":false,"given":"Guanyu","family":"Qu","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7176-3331","authenticated-orcid":false,"given":"Zhenchuan","family":"Liu","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1940-6308","authenticated-orcid":false,"given":"Shuangshuang","family":"Yin","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2176-4697","authenticated-orcid":false,"given":"Xiaojie","family":"Huang","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6128-0384","authenticated-orcid":false,"given":"Chunzhi","family":"He","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2816-9777","authenticated-orcid":false,"given":"Yinben","family":"Xia","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3898-4142","authenticated-orcid":false,"given":"Quan","family":"Wen","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0556-5290","authenticated-orcid":false,"given":"Xiang","family":"Li","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8020-0595","authenticated-orcid":false,"given":"Zekun","family":"He","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3669-1202","authenticated-orcid":false,"given":"Yachen","family":"Wang","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5370-8162","authenticated-orcid":false,"given":"Xianneng","family":"Zou","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3265-049X","authenticated-orcid":false,"given":"Congcong","family":"Miao","sequence":"additional","affiliation":[{"name":"Tencent, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1357-3137","authenticated-orcid":false,"given":"Wenfei","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"AFOX. 2023. AFDCU-Z100-HG64A - AFOX Professional Cards - AFOX. https:\/\/www.afox-corp.com\/show-136-624-1.html."},{"key":"e_1_3_2_1_2_1","volume-title":"Alibaba GPU Cluster Dataset","year":"2023","unstructured":"Alibaba. 2024. Alibaba GPU Cluster Dataset 2023. https:\/\/github.com\/alibaba\/alibaba-lingjun-dataset-2023."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851192"},{"key":"e_1_3_2_1_4_1","unstructured":"Amazon. 2025. High-Performance Computing - Network Specification. https:\/\/docs.aws.amazon.com\/ec2\/latest\/instancetypes\/hpc.html#hpc_network."},{"key":"e_1_3_2_1_5_1","unstructured":"AMD. 2021. AMD Infinity Fabric\u2122 Link. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/other\/56978.pdf."},{"key":"e_1_3_2_1_6_1","unstructured":"AMD. 2023. AMD CDNA\u2122 3 Architecture. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/white-papers\/amd-cdna-3-white-paper.pdf."},{"key":"e_1_3_2_1_7_1","unstructured":"AMD. 2024. RCCL. https:\/\/github.com\/ROCm\/rccl."},{"key":"e_1_3_2_1_8_1","unstructured":"InfiniBand Trade Association. 2023. Infiniband - A Low-Latency High-Bandwidth Interconnect. https:\/\/www.infinibandta.org\/about-infiniband."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_2_1_10_1","volume-title":"Data networks","author":"Bertsekas Dimitri","unstructured":"Dimitri Bertsekas and Robert Gallager. 1992. Data networks. Prentice-Hall, Inc., Chapter 6 Flow Control, 493\u2013536."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_12_1","unstructured":"Cambricon. 2021. MLU370-M8 Intelligent Accelerating Card Product Manual. https:\/\/fccid.io\/2ARVF-MLU370-M8\/User-Manual\/TempConfidential-MLU370-M8-user-manual-V-5528126.pdf."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672239"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3012426.3022184"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2019.2948917"},{"key":"e_1_3_2_1_16_1","unstructured":"Peter Chen. 2022. First Level-E Liquid-cooled OAI Based AI System Solution. https:\/\/146a55aca6f00848c565-a7635525d40ac1c70300198708936b4e.ssl.cf1.rackcdn.com\/images\/00da31e8d80bde510cd27b9074dc0732f1f4d223.pdf."},{"key":"e_1_3_2_1_17_1","volume-title":"GPU Performance (Data Sheets) Quick Reference","author":"Chiao Arthur","year":"2023","unstructured":"Arthur Chiao. 2024. GPU Performance (Data Sheets) Quick Reference (2023). https:\/\/arthurchiao.art\/blog\/gpu-data-sheets."},{"key":"e_1_3_2_1_18_1","unstructured":"Cisco. 2024. Cisco 400G Data Center Networking. https:\/\/www.cisco.com\/site\/us\/en\/products\/networking\/cloud-networking-switches\/400g-switches\/index.html."},{"key":"e_1_3_2_1_19_1","unstructured":"clamchowder. 2022. Hot Chips 34 - Biren's BR100: A Machine Learning GPU from China. https:\/\/chipsandcheese.com\/2022\/10\/04\/hot-chips-34-birens-br100-a-machine-learning-gpu-from-china."},{"key":"e_1_3_2_1_20_1","unstructured":"Unified Communication Framework (UCF) Consortium. 2024. Unified Collective Communication (UCC). https:\/\/github.com\/openucx\/ucc."},{"key":"e_1_3_2_1_21_1","unstructured":"Ultra Ethernet Consortium. 2024. Ultra Ethernet Specification Update. https:\/\/ultraethernet.org\/ultra-ethernet-specification-update."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575724"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00056"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3091475"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/COMHPC.2016.006"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Richard L Graham Lion Levi Devendar Burredy Gil Bloch Gilad Shainer David Cho George Elias Daniel Klein Joshua Ladd Ophir Maor et al. 2020. Scalable Hierarchical Aggregation and Reduction Protocol (SHARP) Streaming-Aggregation Hardware Design and Evaluation. In High Performance Computing: 35th International Conference ISC High Performance 2020 Frankfurt\/Main Germany June 22\u201325 2020 Proceedings 35. 41\u201359.","DOI":"10.1007\/978-3-030-50743-5_3"},{"key":"e_1_3_2_1_28_1","unstructured":"Graphcore. 2024. IPU Processors. https:\/\/www.graphcore.ai\/products\/ipu."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/42411.42415"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3098822.3098825"},{"key":"e_1_3_2_1_32_1","volume-title":"Scalable and Interpretable Scheduler for Deep Learning Training Jobs. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Hu Qinghao","year":"2023","unstructured":"Qinghao Hu, Meng Zhang, Peng Sun, Yonggang Wen, and Tianwei Zhang. 2023. Lucid: A Non-intrusive, Scalable and Interpretable Scheduler for Deep Learning Training Jobs. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2. 457\u2013472."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI66940.2025.00024"},{"key":"e_1_3_2_1_34_1","volume-title":"Communication Algorithm-Architecture Co-Design for Distributed Deep Learning. In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). 181\u2013194","author":"Huang Jiayi","year":"2021","unstructured":"Jiayi Huang, Pritam Majumder, Sungkeun Kim, Abdullah Muzahid, Ki Hwan Yum, and Eun Jung Kim. 2021. Communication Algorithm-Architecture Co-Design for Distributed Deep Learning. In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). 181\u2013194."},{"key":"e_1_3_2_1_35_1","unstructured":"Yanping Huang Youlong Cheng Ankur Bapna Orhan Firat Dehao Chen Mia Chen HyoukJoong Lee Jiquan Ngiam Quoc V Le Yonghui Wu et al. 2019. GPipe: Efficient Training of Giant Neural Networks Using Pipeline Parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_36_1","unstructured":"Huawei. 2024. Affinity Principles of Atlas Training Products. https:\/\/www.hiascend.com\/document\/detail\/en\/mindx-dl\/500\/ref\/affinityschedulesd\/dl_affinity_004.html."},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, Wei Cui, Yifan Xiong, Ziyue Yang, Ze Liu, Han Hu, Zilong Wang, Rafael Salas, Jithin Jose, Prabhat Ram, et al. 2023. Tutel: Adaptive Mixture-of-Experts at Scale. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_38_1","unstructured":"Intel. 2024. oneAPI Collective Communications Library (oneCCL). https:\/\/github.com\/oneapi-src\/oneCCL."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651362"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Korthikanti Vijay Anand","year":"2023","unstructured":"Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. 2023. Reducing Activation Recomputation in Large Transformer Models. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3406591"},{"key":"e_1_3_2_1_42_1","volume-title":"ATP: In-Network Aggregation for Multi-Tenant Learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Lao ChonLam","year":"2021","unstructured":"ChonLam Lao, Yanfang Le, Kshiteej Mahajan, Yixi Chen, Wenfei Wu, Aditya Akella, and Michael Swift. 2021. ATP: In-Network Aggregation for Multi-Tenant Learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). 741\u2013761."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00093"},{"key":"e_1_3_2_1_45_1","volume-title":"Accelerating Distributed MoE Training and Inference with Lina. In 2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, Cong Wang, and Hong Xu. 2023. Accelerating Distributed MoE Training and Inference with Lina. In 2023 USENIX Annual Technical Conference (USENIX ATC 23). 945\u2013959."},{"key":"e_1_3_2_1_46_1","volume-title":"THC: Accelerating Distributed Deep Learning Using Tensor Homomorphic Compression. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Li Minghao","year":"2024","unstructured":"Minghao Li, Ran Ben Basat, Shay Vargaftik, ChonLam Lao, Kevin Xu, Michael Mitzenmacher, and Minlan Yu. 2024. THC: Accelerating Distributed Deep Learning Using Tensor Homomorphic Compression. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 1191\u20131211."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3663408.3663409"},{"key":"e_1_3_2_1_48_1","volume-title":"Ascend: A Scalable and Unified Architecture for Ubiquitous Deep Neural Network Computing. In 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA). 789\u2013801","author":"Liao Heng","year":"2021","unstructured":"Heng Liao, Jiajin Tu, Jing Xia, Hu Liu, Xiping Zhou, Honghui Yuan, and Yuxing Hu. 2021. Ascend: A Scalable and Unified Architecture for Ubiquitous Deep Neural Network Computing. In 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA). 789\u2013801."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672228"},{"key":"e_1_3_2_1_50_1","volume-title":"Hostping: Diagnosing Intra-host Network Bottlenecks in RDMA Servers. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Liu Kefei","year":"2023","unstructured":"Kefei Liu, Zhuo Jiang, Jiao Zhang, Haoran Wei, Xiaolong Zhong, Lizhuang Tan, Tian Pan, and Tao Huang. 2023. Hostping: Diagnosing Intra-host Network Bottlenecks in RDMA Servers. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 15\u201329."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582037"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672249"},{"key":"e_1_3_2_1_53_1","volume-title":"Multi-Path Transport for RDMA in Datacenters. In 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18)","author":"Lu Yuanwei","year":"2018","unstructured":"Yuanwei Lu, Guo Chen, Bojie Li, Kun Tan, Yongqiang Xiong, Peng Cheng, Jiansong Zhang, Enhong Chen, and Thomas Moscibroda. 2018. Multi-Path Transport for RDMA in Datacenters. In 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18). 357\u2013371."},{"key":"e_1_3_2_1_54_1","volume-title":"Themis: Fair and Efficient GPU Cluster Scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. 2020. Themis: Fair and Efficient GPU Cluster Scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20). 289\u2013304."},{"key":"e_1_3_2_1_55_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Mahajan Kshiteej","year":"2023","unstructured":"Kshiteej Mahajan, Ching-Hsiang Chu, Srinivas Sridharan, and Aditya Akella. 2023. Better Together: Jointly Optimizing ML Collective Scheduling and Execution Planning using SYNDICATE. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 809\u2013824."},{"key":"e_1_3_2_1_56_1","unstructured":"Microsoft. 2023. MSCCL. https:\/\/github.com\/microsoft\/msccl."},{"key":"e_1_3_2_1_57_1","volume-title":"Recursively Cautious Congestion Control. In 11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14)","author":"Mittal Radhika","year":"2014","unstructured":"Radhika Mittal, Justine Sherry, Sylvia Ratnasamy, and Scott Shenker. 2014. Recursively Cautious Congestion Control. In 11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14). 373\u2013385."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230564"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_60_1","unstructured":"NVIDIA. 2022. Understanding the Need for Time-Sensitive Networking for Critical Applications. https:\/\/developer.nvidia.com\/blog\/understanding-the-need-for-time-sensitive-networking-for-critical-applications."},{"key":"e_1_3_2_1_61_1","unstructured":"NVIDIA. 2023. NCCL Tests. https:\/\/github.com\/NVIDIA\/nccl-tests."},{"key":"e_1_3_2_1_62_1","unstructured":"NVIDIA. 2023. NVIDIA A40 GPU for Visual Computing. https:\/\/www.nvidia.com\/en-us\/data-center\/a40."},{"key":"e_1_3_2_1_63_1","unstructured":"NVIDIA. 2023. NVIDIA L40S Product Brief. https:\/\/resources.nvidia.com\/en-us-l40s."},{"key":"e_1_3_2_1_64_1","unstructured":"NVIDIA. 2024. NCCL. https:\/\/github.com\/NVIDIA\/nccl."},{"key":"e_1_3_2_1_65_1","unstructured":"NVIDIA. 2024. NVIDIA Announces New Switches Optimized for Trillion-Parameter GPU Computing and AI Infrastructure. https:\/\/nvidianews.nvidia.com\/news\/networking-switches-gpu-computing-ai."},{"key":"e_1_3_2_1_66_1","unstructured":"NVIDIA. 2024. NVIDIA H100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/h100."},{"key":"e_1_3_2_1_67_1","unstructured":"NVIDIA. 2024. NVIDIA H200 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/h200."},{"key":"e_1_3_2_1_68_1","unstructured":"NVIDIA. 2024. NVLink & NVSwitch for Advanced Multi-GPU Communication. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink."},{"key":"e_1_3_2_1_69_1","unstructured":"NVIDIA. 2024. Upgrading Multi-GPU Interconnectivity with the Third-Generation NVIDIA NVSwitch. https:\/\/developer.nvidia.com\/blog\/upgrading-multi-gpu-interconnectivity-with-the-third-generation-nvidia-nvswitch\/?ncid=so-nvsh-708451."},{"key":"e_1_3_2_1_70_1","unstructured":"OpenAI. 2020. GPT-3: Language Models are Few-Shot Learners. https:\/\/github.com\/openai\/gpt-3."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3563766.3564096"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672265"},{"key":"e_1_3_2_1_74_1","volume-title":"International Conference on Machine Learning. 18332\u201318346","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing Mixture-Of-Experts Inference and Training to Power Next-Generation AI Scale. In International Conference on Machine Learning. 18332\u201318346."},{"key":"e_1_3_2_1_75_1","volume-title":"ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201316","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201316."},{"key":"e_1_3_2_1_76_1","volume-title":"Enabling Compute-Communication Overlap in Distributed Deep Learning Training Platforms. In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). 540\u2013553","author":"Rashidi Saeed","year":"2021","unstructured":"Saeed Rashidi, Matthew Denton, Srinivas Sridharan, Sudarshan Srinivasan, Amoghavarsha Suresh, Jade Nie, and Tushar Krishna. 2021. Enabling Compute-Communication Overlap in Distributed Deep Learning Training Platforms. In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). 540\u2013553."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527382"},{"key":"e_1_3_2_1_78_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Romero Joshua","year":"2022","unstructured":"Joshua Romero, Junqi Yin, Nouamane Laanait, Bing Xie, M Todd Young, Sean Treichler, Vitalii Starchenko, Albina Borisevich, Alex Sergeev, and Michael Matheson. 2022. Accelerating Collective Communication in Data Parallel Training across Deep Learning Frameworks. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 1027\u20131040."},{"key":"e_1_3_2_1_79_1","volume-title":"Scaling Distributed Machine Learning With In-Network Aggregation. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Sapio Amedeo","year":"2021","unstructured":"Amedeo Sapio, Marco Canini, Chen-Yu Ho, Jacob Nelson, Panos Kalnis, Changhoon Kim, Arvind Krishnamurthy, Masoud Moshref, Dan Ports, and Peter Richt\u00e1rik. 2021. Scaling Distributed Machine Learning With In-Network Aggregation. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). 785\u2013808."},{"key":"e_1_3_2_1_80_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Shah Aashaka","year":"2023","unstructured":"Aashaka Shah, Vijay Chidambaram, Meghan Cowan, Saeed Maleki, Madan Musuvathi, Todd Mytkowicz, Jacob Nelson, Olli Saarikivi, and Rachee Singh. 2023. TACCL: Guiding Collective Algorithm Synthesis using Communication Sketches. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 593\u2013612."},{"key":"e_1_3_2_1_81_1","volume-title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_82_1","volume-title":"Realizing the AMD Exascale Heterogeneous Processor Vision. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA). 876\u2013889","author":"Smith Alan","year":"2024","unstructured":"Alan Smith, Gabriel H Loh, Michael J Schulte, Mike Ignatowski, Samuel Naffziger, Mike Mantor, Mark Fowler Nathan Kalyanasundharam, Vamsi Alla, Nicholas Malaya, Joseph L Greathouse, et al. 2024. Realizing the AMD Exascale Heterogeneous Processor Vision. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA). 876\u2013889."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575712"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_2_1_85_1","unstructured":"Moore Threads. 2024. MTT S4000 | Moore Threads. https:\/\/en.mthreads.com\/product\/S4000."},{"key":"e_1_3_2_1_86_1","unstructured":"Vast.ai. 2025. Pricing | Vast.ai. https:\/\/vast.ai\/pricing."},{"key":"e_1_3_2_1_87_1","unstructured":"Cudo Ventures. 2023. GPU Cloud - Deploy GPUs On-Demand. https:\/\/www.cudocompute.com\/products\/gpu-cloud."},{"key":"e_1_3_2_1_88_1","volume-title":"22th USENIX Symposium on Networked Systems Design and Implementation (NSDI 25).","author":"Wang Xizheng","unstructured":"Xizheng Wang, Qingxu Li, Yichi Xu, Gang Lu, Dan Li, Li Chen, Heyang Zhou, Linkang Zheng, Sen Zhang, Yikai Zhu, Yang Liu, Pengcheng Zhang, Kun Qian, Kunling He, Jiaqi Gao, Ennan Zhai, Dennis Cai, and Binzhang Fu. 2025. SimAI: Unifying Architecture Design and Performance Tunning for Large-Scale Large Language Model Training with Scalability and Precision. In 22th USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-023-2894-6"},{"key":"e_1_3_2_1_90_1","volume-title":"Implementation and Evaluation of Congestion Control for Multipath TCP. In 8th USENIX Symposium on Networked Systems Design and Implementation (NSDI 11)","author":"Wischik Damon","year":"2011","unstructured":"Damon Wischik, Costin Raiciu, Adam Greenhalgh, and Mark Handley. 2011. Design, Implementation and Evaluation of Congestion Control for Multipath TCP. In 8th USENIX Symposium on Networked Systems Design and Implementation (NSDI 11)."},{"key":"e_1_3_2_1_91_1","volume-title":"2023 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS). 283\u2013294","author":"Won William","year":"2023","unstructured":"William Won, Taekyung Heo, Saeed Rashidi, Srinivas Sridharan, Sudarshan Srinivasan, and Tushar Krishna. 2023. ASTRA-sim2. 0: Modeling Hierarchical Networks and Disaggregated Systems for Large-model Training at Scale. In 2023 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS). 283\u2013294."},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672252"},{"key":"e_1_3_2_1_93_1","unstructured":"Sherry Xu. 2024. Inside Maia 100: Revolutionizing AI Workloads with Microsoft's Custom AI Accelerator. https:\/\/techcommunity.microsoft.com\/t5\/azure-infrastructure-blog\/inside-maia-100-revolutionizing-ai-workloads-with-microsoft-s\/ba-p\/4229118."},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.14778\/3561261.3561265"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/3617232.3624863"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3718958.3750506"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472897"}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3769330","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T12:00:10Z","timestamp":1780660810000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769330"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":98,"alternative-id":["10.1145\/3767295.3769330","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769330","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}