{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:51:12Z","timestamp":1772905872241,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":88,"publisher":"ACM","funder":[{"name":"Beijing Municipal Science and Technology Project","award":["Z241100004224023"],"award-info":[{"award-number":["Z241100004224023"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3718958.3750468","type":"proceedings-article","created":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T16:54:11Z","timestamp":1756313651000},"page":"1-23","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["InfiniteHBD: Building Datacenter-Scale High-Bandwidth Domain for LLM with Optical Circuit Switching Transceivers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7113-0357","authenticated-orcid":false,"given":"Chenchen","family":"Shou","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"},{"name":"StepFun, Shanghai, China"},{"name":"Lightelligence Pte. Ltd., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4933-0276","authenticated-orcid":false,"given":"Guyue","family":"Liu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3031-5930","authenticated-orcid":false,"given":"Hao","family":"Nie","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5326-3056","authenticated-orcid":false,"given":"Huaiyu","family":"Meng","sequence":"additional","affiliation":[{"name":"Lightelligence Pte. Ltd., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3469-0906","authenticated-orcid":false,"given":"Yu","family":"Zhou","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0049-873X","authenticated-orcid":false,"given":"Yimin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Unaffiliated, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1395-8940","authenticated-orcid":false,"given":"Wenqing","family":"Lv","sequence":"additional","affiliation":[{"name":"Lightelligence Pte. Ltd., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7443-9988","authenticated-orcid":false,"given":"Yelong","family":"Xu","sequence":"additional","affiliation":[{"name":"Lightelligence Pte. Ltd., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2554-6490","authenticated-orcid":false,"given":"Yuanwei","family":"Lu","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8725-9315","authenticated-orcid":false,"given":"Zhang","family":"Chen","sequence":"additional","affiliation":[{"name":"Lightelligence Pte. Ltd., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4231-4859","authenticated-orcid":false,"given":"Yanbo","family":"Yu","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3800-2143","authenticated-orcid":false,"given":"Yichen","family":"Shen","sequence":"additional","affiliation":[{"name":"Lightelligence Pte. 
Ltd., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9113-2660","authenticated-orcid":false,"given":"Yibo","family":"Zhu","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6657-5806","authenticated-orcid":false,"given":"Daxin","family":"Jiang","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints.","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. (2023). arXiv:cs.CL\/2305.13245 https:\/\/arxiv.org\/abs\/2305.13245"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/1402958.1402967"},{"key":"e_1_3_2_1_3_1","unstructured":"AMD. 2025. AMD Instinct\u2122 MI300 Series Microarchitecture. (2025). https:\/\/rocm.docs.amd.com\/en\/latest\/conceptual\/gpu-arch\/mi300.html"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672248"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3406221"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.642949"},{"key":"e_1_3_2_1_7_1","volume-title":"https:\/\/www.calient.net\/","author":"AI.","year":"2024","unstructured":"Calient.AI. 2024. (2024). https:\/\/www.calient.net\/."},{"key":"e_1_3_2_1_8_1","volume-title":"OSA: An Optical Switching Architecture for Data Center Networks with Unprecedented Flexibility. In 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12)","author":"Chen Kai","year":"2012","unstructured":"Kai Chen, Ankit Singla, Atul Singh, Kishore Ramachandran, Lei Xu, Yueping Zhang, Xitao Wen, and Yan Chen. 2012. OSA: An Optical Switching Architecture for Data Center Networks with Unprecedented Flexibility. In 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12). USENIX Association."},{"key":"e_1_3_2_1_9_1","volume-title":"Enabling Wide-Spread Communications on Optical Fabric with MegaSwitch. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Chen Li","year":"2017","unstructured":"Li Chen, Kai Chen, Zhonghua Zhu, Minlan Yu, George Porter, Chunming Qiao, and Shan Zhong. 2017. Enabling Wide-Spread Communications on Optical Fabric with MegaSwitch. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 577\u2013593. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/chen"},{"key":"e_1_3_2_1_10_1","unstructured":"Intel Corporation. 2023. Intel\u00ae Gaudi\u00ae 3 AI Accelerator White Paper. (2023). https:\/\/www.intel.com\/content\/www\/us\/en\/content-details\/817486\/intel-gaudi-3-ai-accelerator-white-paper.html"},{"key":"e_1_3_2_1_11_1","unstructured":"NVIDIA Corporation. 2018. Accelerated Computing and the Democratization of Supercomputing: Technical Overview. Technical Report. 
NVIDIA Corporation."},{"key":"e_1_3_2_1_12_1","unstructured":"DeepSeek-AI Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan Damai Dai Daya Guo Dejian Yang Deli Chen Dongjie Ji Erhang Li Fangyun Lin Fucong Dai Fuli Luo Guangbo Hao Guanting Chen Guowei Li H. Zhang Han Bao Hanwei Xu Haocheng Wang Haowei Zhang Honghui Ding Huajian Xin Huazuo Gao Hui Li Hui Qu J. L. Cai Jian Liang Jianzhong Guo Jiaqi Ni Jiashi Li Jiawei Wang Jin Chen Jingchang Chen Jingyang Yuan Junjie Qiu Junlong Li Junxiao Song Kai Dong Kai Hu Kaige Gao Kang Guan Kexin Huang Kuai Yu Lean Wang Lecong Zhang Lei Xu Leyi Xia Liang Zhao Litong Wang Liyue Zhang Meng Li Miaojun Wang Mingchuan Zhang Minghua Zhang Minghui Tang Mingming Li Ning Tian Panpan Huang Peiyi Wang Peng Zhang Qiancheng Wang Qihao Zhu Qinyu Chen Qiushi Du R. J. Chen R. L. Jin Ruiqi Ge Ruisong Zhang Ruizhe Pan Runji Wang Runxin Xu Ruoyu Zhang Ruyi Chen S. S. Li Shanghao Lu Shangyan Zhou Shanhuang Chen Shaoqing Wu Shengfeng Ye Shengfeng Ye Shirong Ma Shiyu Wang Shuang Zhou Shuiping Yu Shunfeng Zhou Shuting Pan T. Wang Tao Yun Tian Pei Tianyu Sun W. L. Xiao Wangding Zeng Wanjia Zhao Wei An Wen Liu Wenfeng Liang Wenjun Gao Wenqin Yu Wentao Zhang X. Q. Li Xiangyue Jin Xianzu Wang Xiao Bi Xiaodong Liu Xiaohan Wang Xiaojin Shen Xiaokang Chen Xiaokang Zhang Xiaosha Chen Xiaotao Nie Xiaowen Sun Xiaoxiang Wang Xin Cheng Xin Liu Xin Xie Xingchao Liu Xingkai Yu Xinnan Song Xinxia Shan Xinyi Zhou Xinyu Yang Xinyuan Li Xuecheng Su Xuheng Lin Y. K. Li Y. Q. Wang Y. X. Wei Y. X. Zhu Yang Zhang Yanhong Xu Yanhong Xu Yanping Huang Yao Li Yao Zhao Yaofeng Sun Yaohui Li Yaohui Wang Yi Yu Yi Zheng Yichao Zhang Yifan Shi Yiliang Xiong Ying He Ying Tang Yishi Piao Yisong Wang Yixuan Tan Yiyang Ma Yiyuan Liu Yongqiang Guo Yu Wu Yuan Ou Yuchen Zhu Yuduan Wang Yue Gong Yuheng Zou Yujia He Yukun Zha Yunfan Xiong Yunxian Ma Yuting Yan Yuxiang Luo Yuxiang You Yuxuan Liu Yuyang Zhou Z. F. Wu Z. Z. Ren Zehui Ren Zhangli Sha Zhe Fu Zhean Xu Zhen Huang Zhen Zhang Zhenda Xie Zhengyan Zhang Zhewen Hao Zhibin Gou Zhicheng Ma Zhigang Yan Zhihong Shao Zhipeng Xu Zhiyu Wu Zhongyu Zhang Zhuoshu Li Zihui Gu Zijia Zhu Zijun Liu Zilin Li Ziwei Xie Ziyang Song Ziyi Gao and Zizheng Pan. 2025. DeepSeek-V3 Technical Report. (2025). arXiv:cs.CL\/2412.19437 https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Machine Learning. PMLR, 5547\u20135569","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, et al. 2022. Glam: Efficient scaling of language models with mixture-of-experts. In International Conference on Machine Learning. PMLR, 5547\u20135569."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851223"},{"key":"e_1_3_2_1_15_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research 23, 120 (2022), 1\u201339.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_16_1","unstructured":"fibermall. 2024. OSFP-400G-FR4 400G FR4 OSFP PAM4 CWDM4 2km LC SMF FEC Optical Transceiver Module. (2024). 
https:\/\/www.fibermall.com\/sale-459190-osfp-400g-fr4-cwdm4-2km.htm."},{"key":"e_1_3_2_1_17_1","volume-title":"https:\/\/www.fibermall.com\/","author":"FIBERMALL.COM.","year":"2024","unstructured":"FIBERMALL.COM. 2024. (2024). https:\/\/www.fibermall.com\/."},{"key":"e_1_3_2_1_18_1","volume-title":"https:\/\/www.fs.com\/","author":"FS.","year":"2024","unstructured":"FS. 2024. (2024). https:\/\/www.fs.com\/."},{"key":"e_1_3_2_1_19_1","volume-title":"https:\/\/www.fs.com\/products\/219579.html","author":"Generic FS.","year":"2024","unstructured":"FS. 2024. 1.5m (5ft) Generic Compatible 400G OSFP Flat Top Passive Direct Attach Copper Twinax Cable. (2024). https:\/\/www.fs.com\/products\/219579.html."},{"key":"e_1_3_2_1_20_1","volume-title":"https:\/\/www.fs.com\/products\/155618.html","author":"Ethernet Passive Direct Attach FS.","year":"2024","unstructured":"FS. 2024. 1.5m (5ft) NVIDIA\/Mellanox MCP1650-V01AE30 Compatible 200G QSFP56 Ethernet Passive Direct Attach Copper Twinax Cable. (2024). https:\/\/www.fs.com\/products\/155618.html."},{"key":"e_1_3_2_1_21_1","volume-title":"https:\/\/www.fs.com\/products\/244361.html","author":"Generic FS.","year":"2024","unstructured":"FS. 2024. 1m (3ft) Generic Compatible 1.6T OSFP Close Top Passive Direct Attach Copper Twinax Cable. (2024). https:\/\/www.fs.com\/products\/244361.html."},{"key":"e_1_3_2_1_22_1","volume-title":"Duplex, 2 Fibers, Single Mode (OS2), Riser (OFNR), 2.0mm, Tight-Buffered, Yellow.","author":"Fiber Patch Cable FS.","year":"2024","unstructured":"FS. 2024. 50m (164ft) Fiber Patch Cable, LC UPC to LC UPC, Duplex, 2 Fibers, Single Mode (OS2), Riser (OFNR), 2.0mm, Tight-Buffered, Yellow. (2024). https:\/\/www.fs.com\/products\/177394.html."},{"key":"e_1_3_2_1_23_1","first-page":"288","article-title":"Megablocks: Efficient sparse training with mixture-of-experts","volume":"5","author":"Gale Trevor","year":"2023","unstructured":"Trevor Gale, Deepak Narayanan, Cliff Young, and Matei Zaharia. 2023. Megablocks: Efficient sparse training with mixture-of-experts. Proceedings of Machine Learning and Systems 5 (2023), 288\u2013304.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672233"},{"key":"e_1_3_2_1_25_1","volume-title":"https:\/\/cloud.google.com\/tpu\/docs\/v5p","year":"2024","unstructured":"Google. 2024. TPUv5p. (2024). https:\/\/cloud.google.com\/tpu\/docs\/v5p."},{"key":"e_1_3_2_1_26_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri et al. 2024. The Llama 3 Herd of Models. (2024). arXiv:cs.AI\/2407.21783 https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_27_1","volume-title":"Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G. Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). USENIX Association, Boston, MA, 485\u2013500. https:\/\/www.usenix.org\/conference\/nsdi19\/presentation\/gu"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626328"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/0-387-29159-8_4"},{"key":"e_1_3_2_1_30_1","unstructured":"Albert Q. 
Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra Singh Chaplot Diego de las Casas Emma Bou Hanna Florian Bressand Gianna Lengyel Guillaume Bour Guillaume Lample L\u00e9lio Renard Lavaud Lucile Saulnier Marie-Anne Lachaux Pierre Stock Sandeep Subramanian Sophia Yang Szymon Antoniak Teven Le Scao Th\u00e9ophile Gervet Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2024. Mixtral of Experts. (2024). arXiv:cs.LG\/2401.04088 https:\/\/arxiv.org\/abs\/2401.04088"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_32_1","unstructured":"Ziheng Jiang Haibin Lin Yinmin Zhong Qi Huang Yangrui Chen Zhi Zhang Yanghua Peng Xiang Li Cong Xie Shibiao Nong Yulu Jia Sun He Hongmin Chen Zhihao Bai Qi Hou Shipeng Yan Ding Zhou Yiyao Sheng Zhuo Jiang Haohan Xu Haoran Wei Zhang Zhang Pengfei Nie Leqi Zou Sida Zhao Liang Xiang Zherui Liu Zhe Li Xiaoying Jia Jianxi Ye Xin Jin and Xin Liu. 2024. MegaScale: Scaling Large Language Model Training to More Than 10 000 GPUs. (2024). arXiv:cs.LG\/2402.15627 https:\/\/arxiv.org\/abs\/2402.15627"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3360307"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472900"},{"key":"e_1_3_2_1_36_1","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, Cong Wang, and Hong Xu. 2023. Accelerating distributed {MoE} training and inference with lina. In 2023 USENIX Annual Technical Conference (USENIX ATC 23). 945\u2013959."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587445"},{"key":"e_1_3_2_1_38_1","volume-title":"Zhizhen Zhong, Guyue Liu, Ying Zhang, Xiaofeng Ye, Yiming Zhang, and Kai Chen.","author":"Liao Xudong","year":"2025","unstructured":"Xudong Liao, Yijun Sun, Han Tian, Xinchen Wan, Yilun Jin, Zilong Wang, Zhenghang Ren, Xinyang Huang, Wenxue Li, Kin Fai Tse, Zhizhen Zhong, Guyue Liu, Ying Zhang, Xiaofeng Ye, Yiming Zhang, and Kai Chen. 2025. mFabric: An Efficient and Scalable Fabric for Mixture-of-Experts Training. (2025). arXiv:cs.NI\/2501.03905 https:\/\/arxiv.org\/abs\/2501.03905"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604836"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604869"},{"key":"e_1_3_2_1_41_1","volume-title":"Themis: Fair and Efficient GPU Cluster Scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. 2020. Themis: Fair and Efficient GPU Cluster Scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20). USENIX Association, Santa Clara, CA, 289\u2013304. https:\/\/www.usenix.org\/conference\/nsdi20\/presentation\/mahajan"},{"key":"e_1_3_2_1_42_1","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mellette William M","year":"2020","unstructured":"William M Mellette, Rajdeep Das, Yibo Guo, Rob McGuinness, Alex C Snoeren, and George Porter. 2020. Expanding across time to deliver bandwidth efficiency and low latency. 
In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20). 1\u201318."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3098822.3098838"},{"key":"e_1_3_2_1_44_1","volume-title":"Our most capable models to date","author":"Introducing Llama","year":"2024","unstructured":"Introducing Llama 3.1: Our most capable models to date. 2024. (2024). https:\/\/ai.meta.com\/blog\/meta-llama-3-1\/."},{"key":"e_1_3_2_1_45_1","unstructured":"OSFP MSA. 2022. The Next Generation of Pluggable Optical Module Solutions from the OSFP MSA. (2022). https:\/\/osfpmsa.org\/assets\/pdf\/OSFP1600_and_OSFP-XD.pdf."},{"key":"e_1_3_2_1_46_1","unstructured":"QSFP-DD MSA. 2024. QSFP-DD\/QSFP-DD800\/QSFP-DD1600 Hardware Specification. (2024). http:\/\/www.qsfp-dd.com\/wp-content\/uploads\/2024\/07\/QSFP-DD-Hardware-Rev7.1.pdf."},{"key":"e_1_3_2_1_47_1","volume-title":"Broadcom Tomahawk 5, Support RoCEv2","author":"NADDOD.","year":"2024","unstructured":"NADDOD. 2024. N9500-128QC, 128x400G QSFP112 Ethernet L3 4U Managed Switch, 51.2Tbps, Broadcom Tomahawk 5, Support RoCEv2, for AI\/ML\/Cloud Data Center\/HPC. (2024). https:\/\/www.naddod.com\/products\/102323.html."},{"key":"e_1_3_2_1_48_1","volume-title":"https:\/\/www.naddod.com\/","author":"NADDOD.COM.","year":"2024","unstructured":"NADDOD.COM. 2024. (2024). https:\/\/www.naddod.com\/."},{"key":"e_1_3_2_1_49_1","volume-title":"Vu Le, Jeffery J Maki, Juniper Networks Scott Sommers, Tom Palkert, and Weiming Chen.","author":"Nowell Mark","year":"2018","unstructured":"Mark Nowell, Cisco Attila Aranyosi, Vu Le, Jeffery J Maki, Juniper Networks Scott Sommers, Tom Palkert, and Weiming Chen. 2018. QSFP-DD: Enabling 15 Watt Cooling Solutions. (2018)."},{"key":"e_1_3_2_1_50_1","unstructured":"NVIDIA. 2018. NVIDIA NVSwitch Technical Overview. (2018). https:\/\/images.nvidia.com\/content\/pdf\/nvswitch-technical-overview.pdf."},{"key":"e_1_3_2_1_51_1","volume-title":"https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/usage\/p2p.html#all-to-all","author":"AllToAll NVIDIA.","year":"2020","unstructured":"NVIDIA. 2020. NCCL AllToAll. (2020). https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/usage\/p2p.html#all-to-all."},{"key":"e_1_3_2_1_52_1","unstructured":"NVIDIA. 2021. Doubling all2all Performance with NVIDIA Collective Communication Library 2.12. (2021). https:\/\/developer.nvidia.com\/blog\/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12\/"},{"key":"e_1_3_2_1_53_1","unstructured":"NVIDIA. 2023. Matrix Multiplication Background User's Guide. (2023). https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-matrix-multiplication\/index.html."},{"key":"e_1_3_2_1_54_1","unstructured":"NVIDIA. 2024. NVIDIA DGX SuperPOD. (2024). https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-superpod."},{"key":"e_1_3_2_1_55_1","unstructured":"NVIDIA. 2024. NVIDIA GB200 NVL72. (2024). https:\/\/www.nvidia.com\/en-us\/data-center\/gb200-nvl72\/."},{"key":"e_1_3_2_1_56_1","unstructured":"NVIDIA. 2024. NVIDIA H100 Tensor Core GPU. (2024). https:\/\/www.nvidia.com\/en-us\/data-center\/h100\/."},{"key":"e_1_3_2_1_57_1","unstructured":"NVIDIA. 2024. NVLink and NVLink Switch. (2024). https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink."},{"key":"e_1_3_2_1_58_1","volume-title":"https:\/\/www.opencompute.org\/documents\/oai-ubb-base-specification-r2-0-v1-0-20230919-pdf","author":"Universal Open Accelerator","year":"2023","unstructured":"Open Accelerator Infrastructure (OAI). 2023. 
Universal Baseboard (UBB) Base Specification r2.0 v1.0. (2023). https:\/\/www.opencompute.org\/documents\/oai-ubb-base-specification-r2-0-v1-0-20230919-pdf."},{"key":"e_1_3_2_1_59_1","first-page":"71","article-title":"Comparative Analysis of Different Topologies Based On Network-on-Chip Architectures","volume":"1","author":"Pandey Deepika","year":"2012","unstructured":"Deepika Pandey. 2012. Comparative Analysis of Different Topologies Based On Network-on-Chip Architectures. International Journal of Engineering Research and Development 1, 11 (2012), 71\u201376.","journal-title":"International Journal of Engineering Research and Development"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_62_1","volume-title":"https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-platform\/","author":"Platform NVIDIA DGX","year":"2024","unstructured":"NVIDIA DGX Platform. 2024. (2024). https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-platform\/."},{"key":"e_1_3_2_1_63_1","volume-title":"www.polatis.com","year":"2025","unstructured":"Polatis. 2025. Homepage. (2025). www.polatis.com"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/2534169.2486007"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672265"},{"key":"e_1_3_2_1_66_1","volume-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Qiao Aurick","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). USENIX Association, 1\u201318. https:\/\/www.usenix.org\/conference\/osdi21\/presentation\/qiao"},{"key":"e_1_3_2_1_67_1","unstructured":"Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. (2020). arXiv:cs.LG\/1910.02054 https:\/\/arxiv.org\/abs\/1910.02054"},{"key":"e_1_3_2_1_68_1","unstructured":"SemiAnalysis. 2023. Google OCS Apollo: The >$3 Billion Game-Changer in Datacenter Networking. (2023). https:\/\/semianalysis.com\/2023\/03\/17\/google-apollo-the-3-billion-game\/."},{"key":"e_1_3_2_1_69_1","unstructured":"SemiAnalysis. 2024. GB200 Hardware Architecture - Component Supply Chain & BOM. (2024). https:\/\/semianalysis.com\/2024\/07\/17\/gb200-hardware-architecture-and-component\/."},{"key":"e_1_3_2_1_70_1","unstructured":"SemiAnalysis. 2024. NVIDIA's Blackwell Reworked - Shipment Delays & GB200A Reworked Platforms. (2024). https:\/\/semianalysis.com\/2024\/08\/04\/nvidias-blackwell-reworked-shipment\/."},{"key":"e_1_3_2_1_71_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. (2020). 
arXiv:cs.CL\/1909.08053 https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_2_1_72_1","volume-title":"Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, and Bryan Catanzaro.","author":"Smith Shaden","year":"2022","unstructured":"Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, and Bryan Catanzaro. 2022. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model. (2022). arXiv:cs.CL\/2201.11990 https:\/\/arxiv.org\/abs\/2201.11990"},{"key":"e_1_3_2_1_73_1","unstructured":"Xingwu Sun Yanfeng Chen Yiqing Huang Ruobing Xie Jiaqi Zhu Kai Zhang Shuaipeng Li Zhen Yang Jonny Han Xiaobo Shu Jiahao Bu Zhongzhi Chen Xuemeng Huang Fengzong Lian Saiyong Yang Jianfeng Yan Yuyuan Zeng Xiaoqin Ren Chao Yu Lulu Wu Yue Mao Jun Xia Tao Yang Suncong Zheng Kan Wu Dian Jiao Jinbao Xue Xipeng Zhang Decheng Wu Kai Liu Dengpeng Wu Guanghui Xu Shaohua Chen Shuang Chen Xiao Feng Yigeng Hong Junqiang Zheng Chengcheng Xu Zongwei Li Xiong Kuang Jianglu Hu Yiqi Chen Yuchi Deng Guiyang Li Ao Liu Chenchen Zhang Shihui Hu Zilong Zhao Zifan Wu Yao Ding Weichao Wang Han Liu Roberts Wang Hao Fei Peijie Yu Ze Zhao Xun Cao Hai Wang Fusheng Xiang Mengyuan Huang Zhiyuan Xiong Bin Hu Xuebin Hou Lei Jiang Jianqiang Ma Jiajia Wu Yaping Deng Yi Shen Qian Wang Weijie Liu Jie Liu Meng Chen Liang Dong Weiwen Jia Hu Chen Feifei Liu Rui Yuan Huilin Xu Zhenxiang Yan Tengfei Cao Zhichao Hu Xinhua Feng Dong Du Tinghao Yu Yangyu Tao Feng Zhang Jianchen Zhu Chengzhong Xu Xirui Li Chong Zha Wen Ouyang Yinben Xia Xiang Li Zekun He Rongpeng Chen Jiawei Song Ruibin Chen Fan Jiang Chongqing Zhao Bo Wang Hao Gong Rong Gan Winston Hu Zhanhui Kang Yong Yang Yuhong Liu Di Wang and Jie Jiang. 2024. Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent. (2024). arXiv:cs.CL\/2411.02265 https:\/\/arxiv.org\/abs\/2411.02265"},{"key":"e_1_3_2_1_74_1","volume-title":"www.telescent.com\/products","year":"2025","unstructured":"Telescent. 2025. Homepage. (2025). www.telescent.com\/products"},{"key":"e_1_3_2_1_75_1","volume-title":"https:\/\/en.wikipedia.org\/wiki\/Tesla_Dojo","author":"Dojo Tesla","year":"2024","unstructured":"Tesla. 2024. Tesla Dojo. (2024). https:\/\/en.wikipedia.org\/wiki\/Tesla_Dojo."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-39924-7_38"},{"key":"e_1_3_2_1_77_1","volume-title":"https:\/\/aws.amazon.com\/ai\/machine-learning\/trainium\/","author":"Trainium AWS","year":"2024","unstructured":"AWS Trainium. 2024. (2024). https:\/\/aws.amazon.com\/ai\/machine-learning\/trainium\/."},{"key":"e_1_3_2_1_78_1","volume-title":"Mission Apollo: Landing Optical Circuit Switching at Datacenter Scale.","author":"Urata Ryohei","year":"2022","unstructured":"Ryohei Urata, Hong Liu, Kevin Yasumura, Erji Mao, Jill Berger, Xiang Zhou, Cedric Lam, Roy Bannon, Darren Hutchinson, Daniel Nelson, Leon Poutievski, Arjun Singh, Joon Ong, and Amin Vahdat. 2022. Mission Apollo: Landing Optical Circuit Switching at Datacenter Scale. (2022). 
arXiv:cs.NI\/2208.10041 https:\/\/arxiv.org\/abs\/2208.10041"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851222"},{"key":"e_1_3_2_1_80_1","volume-title":"Rail-only: A Low-Cost High-Performance Network for Training LLMs with Trillion Parameters.","author":"Wang Weiyang","year":"2024","unstructured":"Weiyang Wang, Manya Ghobadi, Kayvon Shakeri, Ying Zhang, and Naader Hasani. 2024. Rail-only: A Low-Cost High-Performance Network for Training LLMs with Trillion Parameters. (2024). arXiv:cs.NI\/2307.12169 https:\/\/arxiv.org\/abs\/2307.12169"},{"key":"e_1_3_2_1_81_1","volume-title":"TopoOpt: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Weiyang","year":"2023","unstructured":"Weiyang Wang, Moein Khazraee, Zhizhen Zhong, Manya Ghobadi, Zhihao Jia, Dheevatsa Mudigere, Ying Zhang, and Anthony Kewitsch. 2023. TopoOpt: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 739\u2013767."},{"key":"e_1_3_2_1_82_1","volume-title":"MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 945\u2013960. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/weng"},{"key":"e_1_3_2_1_83_1","unstructured":"Wikipedia. 2024. Mach-Zehnder interferometer. (2024). https:\/\/en.wikipedia.org\/wiki\/Mach-Zehnder_interferometer."},{"key":"e_1_3_2_1_84_1","volume-title":"7th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 15)","author":"Xia Yiting","year":"2015","unstructured":"Yiting Xia, Mike Schlansker, TS Eugene Ng, and Jean Tourrilhes. 2015. Enabling Topological Flexibility for Data Centers Using {OmniSwitch}. In 7th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 15)."},{"key":"e_1_3_2_1_85_1","volume-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 595\u2013610. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/xiao"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1109\/35.965375"},{"key":"e_1_3_2_1_87_1","volume-title":"DistTrain: Addressing model and data heterogeneity with disaggregated training for multimodal large language models. arXiv preprint arXiv:2408.04275","author":"Zhang Zili","year":"2024","unstructured":"Zili Zhang, Yinmin Zhong, Ranchen Ming, Hanpeng Hu, Jianjian Sun, Zheng Ge, Yibo Zhu, and Xin Jin. 2024. 
DistTrain: Addressing model and data heterogeneity with disaggregated training for multimodal large language models. arXiv preprint arXiv:2408.04275 (2024)."},{"key":"e_1_3_2_1_88_1","unstructured":"Yinmin Zhong Zili Zhang Bingyang Wu Shengyu Liu Yukun Chen Changyi Wan Hanpeng Hu Lei Xia Ranchen Ming Yibo Zhu et al. 2024. RLHFuse: Efficient rlhf training for large language models with inter-and intra-stage fusion. arXiv preprint arXiv:2409.13221 (2024)."}],"event":{"name":"SIGCOMM '25: ACM SIGCOMM 2025 Conference","location":"S\u00e3o Francisco Convent Coimbra Portugal","acronym":"SIGCOMM '25","sponsor":["SIGCOMM ACM Special Interest Group on Data Communication"]},"container-title":["Proceedings of the ACM SIGCOMM 2025 Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3718958.3750468","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T16:55:21Z","timestamp":1756313721000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3718958.3750468"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,27]]},"references-count":88,"alternative-id":["10.1145\/3718958.3750468","10.1145\/3718958"],"URL":"https:\/\/doi.org\/10.1145\/3718958.3750468","relation":{},"subject":[],"published":{"date-parts":[[2025,8,27]]},"assertion":[{"value":"2025-08-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
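
The block above is a single work record as returned by the public Crossref REST API (note the "status":"ok" / "message-type":"work" envelope). Below is a minimal sketch of how such a record can be fetched and read, assuming network access to api.crossref.org; the field names used (title, DOI, type, author, references-count) all appear in the record above, but availability varies across records, hence the defensive .get() calls.

# Minimal sketch: fetch and read a Crossref work record like the one above.
# Assumes network access to the public Crossref REST API (api.crossref.org);
# no third-party dependencies, Python standard library only.
import json
import urllib.request

DOI = "10.1145/3718958.3750468"  # the DOI from the record above

def fetch_work(doi: str) -> dict:
    """GET /works/{doi} and return the 'message' payload (the work record)."""
    url = f"https://api.crossref.org/works/{doi}"
    with urllib.request.urlopen(url) as resp:
        envelope = json.load(resp)
    # Crossref wraps the record in a status envelope, as seen above.
    assert envelope.get("status") == "ok"
    return envelope["message"]

work = fetch_work(DOI)
print(work["title"][0])             # titles are a list of strings
print(work["DOI"], work["type"])    # e.g. proceedings-article
# Each 'author' entry carries given/family names plus affiliations.
for author in work.get("author", []):
    print(f'{author.get("given", "")} {author.get("family", "")}'.strip())
print("references:", work.get("references-count"))

One reading note grounded in the record itself: Crossref encodes all dates as "date-parts" arrays rather than ISO strings, so "published":{"date-parts":[[2025,8,27]]} above denotes 2025-08-27, and a partial date such as [[2025]] would carry the year only.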