{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T12:08:19Z","timestamp":1763381299190,"version":"3.45.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2211383"],"award-info":[{"award-number":["CNS-2211383"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,17]]},"DOI":"10.1145\/3772356.3772415","type":"proceedings-article","created":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T12:02:48Z","timestamp":1763380968000},"page":"280-288","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Your network doesn't end at the NIC"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1146-9931","authenticated-orcid":false,"given":"Raj","family":"Joshi","sequence":"first","affiliation":[{"name":"Harvard University, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1530-2568","authenticated-orcid":false,"given":"Saksham","family":"Agarwal","sequence":"additional","affiliation":[{"name":"UIUC, Champaign, IL, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0214-664X","authenticated-orcid":false,"given":"ChonLam","family":"Lao","sequence":"additional","affiliation":[{"name":"Harvard University, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2381-0212","authenticated-orcid":false,"given":"Minlan","family":"Yu","sequence":"additional","affiliation":[{"name":"Harvard University, Cambridge, MA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3563766.3564110"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604878"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData47090.2019.9005703"},{"key":"e_1_3_2_1_4_1","unstructured":"Matt Bowman and Jeremy Baumgartner. 2023. Grand Teton Systems Overview. https:\/\/www.youtube.com\/watch?v=fmXfWad-NiA."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCOM.1974.1092259"},{"key":"e_1_3_2_1_6_1","unstructured":"Google Cloud. 2025. Networking and GPU Machines | Compute Engine Documentation. https:\/\/cloud.google.com\/compute\/docs\/gpus\/gpu-network-bandwidth."},{"key":"e_1_3_2_1_7_1","unstructured":"Ultra Ethernet Consortium. 2025. Ultra Ethernet Consortium (UEC) Launches Specification 1.0 Transforming Ethernet for AI and HPC at Scale."},{"key":"e_1_3_2_1_8_1","unstructured":"DeepSeek-AI. 2025. DeepSeek-V3 Technical Report. arXiv:2412.19437 [cs]"},{"key":"e_1_3_2_1_9_1","unstructured":"Dell. 2023. A State-of-the-Art Data Center for Large-Scale AI. https:\/\/www.dell.com\/en-us\/blog\/a-state-of-the-art-data-center-for-large-scale-ai\/."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of NSDI.","author":"Deng Yangtao","year":"2025","unstructured":"Yangtao Deng, Xiang Shi, Zhuo Jiang, Xingjian Zhang, Lei Zhang, Zhang Zhang, Bo Li, Zuquan Song, Hang Zhu, Gaohong Liu, Fuliang Li, Shuguang Wang, Haibin Lin, Jianxi Ye, and Minlan Yu. 2025. Minder: Faulty Machine Detection for Large-scale Distributed Model Training. In Proceedings of NSDI."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of NSDI.","author":"Dong Jianbo","year":"2025","unstructured":"Jianbo Dong, Kun Qian, Pengcheng Zhang, Zhilong Zheng, Liang Chen, Fei Feng, Yichi Xu, Yikai Zhu, Gang Lu, Xue Li, Zhihui Ren, Zhicheng Wang, Bin Luo, Peng Zhang, Yang Liu, Yanqing Chen, Yu Guan, Weicheng Wang, Chaojie Yang, Yang Zhang, Man Yuan, Hanyu Zhao, Yong Li, Zihan Zhao, Shan Li, Xianlong Zeng, Zhiping Yao, Binzhang Fu, Ennan Zhai, Wei Lin, Chao Wang, and Dennis Cai. 2025. Evolution of Aegis: Fault Diagnosis for {AI} Model Training Service in Production. In Proceedings of NSDI."},{"key":"e_1_3_2_1_12_1","unstructured":"NVIDIA Developer Forum. 2021. Why Do Lossy RoCE Accelerations Change RDMA Write {First Middle Last} to RDMA Write Only? https:\/\/forums.developer.nvidia.com\/t\/why-do-lossy-roce-accelerations-change-rdma-write-first-middle-last-to-rdma-write-only\/205952."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672233"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"GigaByte. 2023. G893-ZD1-AAX3 GPU Server. https:\/\/www.gigabyte.com\/Enterprise\/GPU-Server\/G893-ZD1-AAX3.","DOI":"10.46471\/gigabyte.89"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3642968.3654820"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Christian Hopps. 2000. Analysis of an Equal-Cost Multi-Path Algorithm. RFC 2992.","DOI":"10.17487\/rfc2992"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of NSDI.","author":"Hou Wentao","year":"2024","unstructured":"Wentao Hou, Jie Zhang, Zeke Wang, and Ming Liu. 2024. Understanding Routable PCIe Performance for Composable Infrastructures. In Proceedings of NSDI."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3555050.3569128"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of NSDI.","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, KyoungSoo Park, Ran Shu, Xinyuan Qu, Peng Cheng, and Yongqiang Xiong. 2023. ARK: GPU-driven Code Execution for Distributed Deep Learning. In Proceedings of NSDI."},{"key":"e_1_3_2_1_20_1","unstructured":"Sylvain Jeaugey. 2022. Commenton 'Abnormal recovery problem when GPU node fails' Issue #989. GitHub. https:\/\/github.com\/NVIDIA\/nccl\/issues\/989#issuecomment-1361225583 Accessed: 2025-07-02."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of NSDI.","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, Yulu Jia, Sun He, Hongmin Chen, Zhihao Bai, Qi Hou, Shipeng Yan, Ding Zhou, Yiyao Sheng, Zhuo Jiang, Haohan Xu, Haoran Wei, Zhang Zhang, Pengfei Nie, Leqi Zou, Sida Zhao, Liang Xiang, Zherui Liu, Zhe Li, Xiaoying Jia, Jianxi Ye, Xin Jin, and Xin Liu. 2024. MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs. In Proceedings of NSDI."},{"key":"e_1_3_2_1_22_1","unstructured":"Nathan Kalyanasundharam. 2025. UALink 200G 1.0 Specification. https:\/\/ualinkconsortium.org\/."},{"key":"e_1_3_2_1_23_1","unstructured":"Chetan Kapoor Alexandru Costin and Belinda Zeng. 2023. A Deep Dive on AWS Infrastructure Powering the Generative AI Boom. https:\/\/d1.awsstatic.com\/events\/Summits\/reinvent2023\/CMP201_A-deep-dive-on-AWS-infrastructure-powering-the-generative-AI-boom.pdf."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of OSDI.","author":"Kim Sangman","year":"2014","unstructured":"Sangman Kim, Seonggu Huh, Xinya Zhang, Yige Hu, Amir Wated, Emmett Witchel, and Mark Silberstein. 2014. GPUnet: Networking Abstractions for GPU Programs. In Proceedings of OSDI."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_27_1","unstructured":"ChonLam Lao Minlan Yu Aditya Akella Jiamin Cao Yu Guan Pengcheng Zhang Zhilong Zheng Yichi Xu Ennan Zhai Dennis Cai and Jiaqi Gao. 2025. TrainMover: An Interruption-Resilient and Reliable ML Training Runtime. arXiv:2412.12636 [cs.DC]"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of OSDI.","author":"Lee Wonbeom","year":"2024","unstructured":"Wonbeom Lee, Jungi Lee, Junghwan Seo, and Jaewoong Sim. 2024. InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management. In Proceedings of OSDI."},{"key":"e_1_3_2_1_29_1","unstructured":"Lenovo. 2023. ThinkSystem SR685a V3. https:\/\/pubs.lenovo.com\/sr685a-v3\/."},{"key":"e_1_3_2_1_30_1","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel Sebastian Riedel and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. arXiv:2005.11401 [cs]"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of NSDI.","author":"Liu Kefei","year":"2023","unstructured":"Kefei Liu, Zhuo Jiang, Jiao Zhang, Haoran Wei, Xiaolong Zhong, Lizhuang Tan, Tian Pan, and Tao Huang. 2023. Hostping: Diagnosing Intra-host Network Bottlenecks in RDMA Servers. In Proceedings of NSDI."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of NSDI.","author":"Liu Vincent","year":"2013","unstructured":"Vincent Liu, Daniel Halperin, Arvind Krishnamurthy, and Thomas Anderson. 2013. F10: A Fault-Tolerant Engineered Network. In Proceedings of NSDI."},{"key":"e_1_3_2_1_33_1","unstructured":"MinIO. 2025. NVIDIA GPUDirect Storage and MinIO AIStor: Unlocking Efficiency for GPU-Powered AI Workloads. https:\/\/blog.min.io\/nvidia-gpudirect-storage-and-aistor\/"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_35_1","unstructured":"NVIDIA. 2020. DGX A100 System Topology. https:\/\/docs.nvidia.com\/dgx\/dgxa100-user-guide\/introduction-to-dgxa100.html#dgx-a100-system-topology."},{"key":"e_1_3_2_1_36_1","unstructured":"NVIDIA. 2020. GPUDirect Storage. https:\/\/docs.nvidia.com\/gpudirect-storage\/index.html."},{"key":"e_1_3_2_1_37_1","unstructured":"NVIDIA. 2021. Magnum IO GPUDirect Storage Overview Guide v1.0.0. https:\/\/docs.nvidia.com\/cuda\/archive\/11.4.0\/gds\/pdf\/overview-guide.pdf."},{"key":"e_1_3_2_1_38_1","unstructured":"NVIDIA. 2022. Doubling All2all Performance with NVIDIA Collective Communication Library 2.12. https:\/\/developer.nvidia.com\/blog\/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12\/."},{"key":"e_1_3_2_1_39_1","unstructured":"NVIDIA. 2024. Storage Architecture. https:\/\/docs.nvidia.com\/dgx-superpod\/reference-architecture\/scalable-infrastructure-h200\/latest\/storage-architecture.html"},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA. 2025. GPUDirect RDMA. https:\/\/docs.nvidia.com\/cuda\/gpudirect-rdma\/index.html."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_42_1","unstructured":"PCI-SIG 2022. PCI Express Base Specification Revision 6.0. PCI-SIG."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Jon Postel. 1981. Internet Protocol. Request for Comments RFC 791. Internet Engineering Task Force.","DOI":"10.17487\/rfc0791"},{"key":"e_1_3_2_1_44_1","unstructured":"Lenovo Press. 2024. ThinkSystem NVIDIA H100 PCIe Gen5 GPUs Product Guide. https:\/\/lenovopress.lenovo.com\/lp1732-thinksystem-nvidia-h100-pcie-gen5-gpu."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of FAST.","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation \u2014 A KVCache-centric Architecture for Serving LLM Chatbot. In Proceedings of FAST."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575748"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3719330.3721230"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of OSDI.","author":"Ren Zhenghang","year":"2025","unstructured":"Zhenghang Ren and Yuxuan Li. 2025. Enabling Efficient GPU Communication over Multiple NICs with FuseLink. In Proceedings of OSDI."},{"key":"e_1_3_2_1_50_1","unstructured":"Amazon Web Services. 2020. Amazon EC2 P4d Instances Deep Dive | AWS Compute Blog. https:\/\/aws.amazon.com\/blogs\/compute\/amazon-ec2-p4d-instances-deep-dive\/."},{"key":"e_1_3_2_1_51_1","unstructured":"Noam Shazeer Azalia Mirhoseini Krzysztof Maziarz Andy Davis Quoc Le Geoffrey Hinton and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. arXiv:1701.06538 [cs]"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of ICML.","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. In Proceedings of ICML."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of NSDI.","author":"Shu Ran","year":"2019","unstructured":"Ran Shu, Peng Cheng, Guo Chen, Zhiyuan Guo, Lei Qu, Yongqiang Xiong, Derek Chiou, and Thomas Moscibroda. 2019. Direct Universal Access: Making Data Center Resources Available to FPGA. In Proceedings of NSDI."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of OSDI.","author":"Skiadopoulos Athinagoras","year":"2024","unstructured":"Athinagoras Skiadopoulos, Zhiqiang Xie, Mark Zhao, Qizhe Cai, Saksham Agarwal, Jacob Adelmann, David Ahern, Carlo Contavalli, Michael Goldflam, Vitaly Mayatskikh, Raghu Raja, Daniel Walton, Rachit Agarwal, Shrijeet Mukherjee, and Christos Kozyrakis. 2024. High-Throughput and Flexible Host Networking for Accelerated Computing. In Proceedings of OSDI."},{"key":"e_1_3_2_1_55_1","unstructured":"Supermicro. 2023. AS-8125GS-TNHR. https:\/\/www.supermicro.com\/manuals\/superserver\/8U\/MNL-2598.pdf."},{"key":"e_1_3_2_1_56_1","unstructured":"The Llama Team. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs]"},{"key":"e_1_3_2_1_57_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2023. Attention Is All You Need. arXiv:1706.03762 [cs]"},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of ASPLOS.","author":"Kumar Abhishek Vijaya","year":"2025","unstructured":"Abhishek Vijaya Kumar, Gianni Antichi, and Rachee Singh. 2025. Aqua: Network-Accelerated Memory Offloading for LLMs in Scale-Up GPU Domains. In Proceedings of ASPLOS."},{"key":"e_1_3_2_1_59_1","unstructured":"vLLM. 2025. Distributed Inference and Serving. https:\/\/docs.vllm.ai\/en\/latest\/serving\/distributed_serving.html."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672271"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_1_62_1","unstructured":"WikiChip. 2022. NVLink - Nvidia. https:\/\/en.wikichip.org\/wiki\/nvidia\/nvlink."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.7544\/issn1000-1239.202330402"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Takeshi Yoshimura Tatsuhiro Chiba Manish Sethi Daniel Waddington and Swaminathan Sundararaman. 2025. Speeding up Model Loading with fastsafetensors. arXiv:2505.23072v1 [cs.DC]","DOI":"10.1109\/CLOUD67622.2025.00026"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of OSDI.","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In Proceedings of OSDI."}],"event":{"name":"HotNets '25: 24th ACM Workshop on Hot Topics in Networks","location":"UMD Campus College Park MD USA","acronym":"HotNets '25","sponsor":["SIGCOMM ACM Special Interest Group on Data Communication"]},"container-title":["Proceedings of the 24th ACM Workshop on Hot Topics in Networks"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772356.3772415","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T12:04:46Z","timestamp":1763381086000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772356.3772415"}},"subtitle":["A case for unifying the inter-host and intra-host networks in (AI) datacenters"],"short-title":[],"issued":{"date-parts":[[2025,11,17]]},"references-count":65,"alternative-id":["10.1145\/3772356.3772415","10.1145\/3772356"],"URL":"https:\/\/doi.org\/10.1145\/3772356.3772415","relation":{},"subject":[],"published":{"date-parts":[[2025,11,17]]},"assertion":[{"value":"2025-11-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}