{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T15:31:20Z","timestamp":1773588680936,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3779212.3790188","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T13:55:26Z","timestamp":1773150926000},"page":"1201-1215","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MSCCL++: Rethinking GPU Communication Abstractions for AI Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-8756-4480","authenticated-orcid":false,"given":"Changho","family":"Hwang","sequence":"first","affiliation":[{"name":"Microsoft Research, Vancouver, BC, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4014-4757","authenticated-orcid":false,"given":"Peng","family":"Cheng","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8815-7468","authenticated-orcid":false,"given":"Roshan","family":"Dathathri","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4849-6776","authenticated-orcid":false,"given":"Abhinav","family":"Jangda","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7998-3681","authenticated-orcid":false,"given":"Saeed","family":"Maleki","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2482-7892","authenticated-orcid":false,"given":"Madan","family":"Musuvathi","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7596-4734","authenticated-orcid":false,"given":"Olli","family":"Saarikivi","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0628-4515","authenticated-orcid":false,"given":"Aashaka","family":"Shah","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0491-7082","authenticated-orcid":false,"given":"Ziyue","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9295-6530","authenticated-orcid":false,"given":"Binyang","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Azure, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2036-1379","authenticated-orcid":false,"given":"Caio","family":"Rocha","sequence":"additional","affiliation":[{"name":"Microsoft Azure, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8490-7990","authenticated-orcid":false,"given":"Qinghua","family":"Zhou","sequence":"additional","affiliation":[{"name":"Microsoft Azure, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8226-5515","authenticated-orcid":false,"given":"Mahdieh","family":"Ghazimirsaeed","sequence":"additional","affiliation":[{"name":"Microsoft Azure, Cambridge, MA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0414-2200","authenticated-orcid":false,"given":"Sreevatsa","family":"Anantharamu","sequence":"additional","affiliation":[{"name":"Microsoft Azure, ,"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9549-7918","authenticated-orcid":false,"given":"Jithin","family":"Jose","sequence":"additional","affiliation":[{"name":"Microsoft Azure, Austin, TX, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"ROCm Communication Collectives Library (RCCL). https:\/\/github.com\/ROCm\/rccl. [Online","author":"AMD.","year":"2025","unstructured":"AMD. 2025. ROCm Communication Collectives Library (RCCL). https:\/\/github.com\/ROCm\/rccl. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672239"},{"key":"e_1_3_2_1_4_1","volume-title":"MSCCL: Microsoft Collective Communication Library. arXiv:2201.11840","author":"Cowan Meghan","year":"2022","unstructured":"Meghan Cowan, Saeed Maleki, Madan Musuvathi, Olli Saarikivi, and Yifan Xiong. 2022. MSCCL: Microsoft Collective Communication Library. arXiv:2201.11840"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575724"},{"key":"e_1_3_2_1_6_1","unstructured":"DeepSeek-AI. 2024. DeepSeek-V3 Technical Report. arXiv:2412.19437"},{"key":"e_1_3_2_1_7_1","unstructured":"Raja Gond Nipun Kwatra and Ramachandran Ramjee. 2025. TokenWeave: Efficient Compute-Communication Overlap for Distributed LLM Inference. arXiv:2505.11329"},{"key":"e_1_3_2_1_8_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783"},{"key":"e_1_3_2_1_9_1","unstructured":"Zhiyi Hu Siyuan Shen Tommaso Bonato Sylvain Jeaugey Cedell Alexander Eric Spada James Dinan Jeff Hammond and Torsten Hoefler. 2025. Demystifying NCCL: An In-depth Analysis of GPU Communication Protocols and Algorithms. arXiv:2507.04786"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the 6th Conference on Machine Learning and Systems (MLSys).","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, Wei Cui, Yifan Xiong, Ziyue Yang, Ze Liu, Han Hu, Zilong Wang, Rafael Salas, Jithin Jose, Prabhat Ram, HoYuen Chau, Peng Cheng, Fan Yang, Mao Yang, and Yongqiang Xiong. 2023a. Tutel: Adaptive Mixture-of-Experts at Scale. In Proceedings of the 6th Conference on Machine Learning and Systems (MLSys)."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI).","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, KyoungSoo Park, Ran Shu, Xinyuan Qu, Peng Cheng, and Yongqiang Xiong. 2023b. ARK: GPU-driven Code Execution for Distributed Deep Learning. In Proceedings of the 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_14_1","volume-title":"Liangyu Zhao, Vincent Liu, Miguel Castro, Srikanth Kandula, and Luke Marshall.","author":"Liu Xuting","year":"2024","unstructured":"Xuting Liu, Behnaz Arzani, Siva Kesava Reddy Kakarla, Liangyu Zhao, Vincent Liu, Miguel Castro, Srikanth Kandula, and Luke Marshall. 2024. Rethinking Machine Learning Collective Communication as a Multi-Commodity Flow Problem. In Proceedings of the 38th ACM Special Interest Group on Data Communication (SIGCOMM)."},{"key":"e_1_3_2_1_15_1","unstructured":"Pak Markthub Jim Dinan Sreeram Potluri and Seth Howell. 2022. Improving Network Performance of HPC Systems Using NVIDIA Magnum IO NVSHMEM and GPUDirect Async. https:\/\/developer.nvidia.com\/blog\/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_16_1","volume-title":"Microsoft Collective Communication Library. https:\/\/github.com\/azure\/msccl. [Online","year":"2025","unstructured":"Microsoft. 2025a. Microsoft Collective Communication Library. https:\/\/github.com\/azure\/msccl. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_17_1","volume-title":"MSCCL: A GPU-driven communication stack for scalable AI applications. https:\/\/github.com\/microsoft\/mscclpp. [Online","year":"2025","unstructured":"Microsoft. 2025b. MSCCL: A GPU-driven communication stack for scalable AI applications. https:\/\/github.com\/microsoft\/mscclpp. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_18_1","volume-title":"https:\/\/github.com\/azure\/msccl-scheduler. [Online","author":"Scheduler MSCCL","year":"2025","unstructured":"Microsoft. 2025c. MSCCL Scheduler. https:\/\/github.com\/azure\/msccl-scheduler. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_19_1","volume-title":"Multicast Support -- CUDA C Programming Guide. docs.nvidia.com\/cuda\/archive\/13.1.0\/cuda-c-programming-guide\/index.html#multicast-support. [Online","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025a. 14.9. Multicast Support -- CUDA C Programming Guide. docs.nvidia.com\/cuda\/archive\/13.1.0\/cuda-c-programming-guide\/index.html#multicast-support. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_20_1","volume-title":"Multinode NVLink User Guide. https:\/\/docs.nvidia.com\/multi-node-nvlink-systems\/mnnvl-user-guide\/. [Online","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025b. Multinode NVLink User Guide. https:\/\/docs.nvidia.com\/multi-node-nvlink-systems\/mnnvl-user-guide\/. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_21_1","volume-title":"nvbandwidth: A tool for bandwidth measurements on NVIDIA GPUs. https:\/\/github.com\/NVIDIA\/nvbandwidth. [Online","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025c. nvbandwidth: A tool for bandwidth measurements on NVIDIA GPUs. https:\/\/github.com\/NVIDIA\/nvbandwidth. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_22_1","volume-title":"NVIDIA Collective Communications Library (NCCL). https:\/\/github.com\/NVIDIA\/nccl. [Online","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025d. NVIDIA Collective Communications Library (NCCL). https:\/\/github.com\/NVIDIA\/nccl. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_23_1","volume-title":"2025 e","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025 e. NVIDIA OpenSHMEM Library (NVSHMEM) Documentation. https:\/\/docs.nvidia.com\/nvshmem\/api\/. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_24_1","volume-title":"Optimizing for Low-Latency Communication in Inference Workloads with JAX and XLA. developer.nvidia.com\/blog\/optimizing-for-low-latency-communication-in-inference-workloads-with-jax-and-xla\/. [Online","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025. Optimizing for Low-Latency Communication in Inference Workloads with JAX and XLA. developer.nvidia.com\/blog\/optimizing-for-low-latency-communication-in-inference-workloads-with-jax-and-xla\/. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_25_1","volume-title":"https:\/\/github.com\/NVIDIA\/TensorRT-LLM. [Online","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025a. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_26_1","volume-title":"User Buffer Registration - NCCL 2.26.2 documentation. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/usage\/bufferreg.html. [Online","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025b. User Buffer Registration - NCCL 2.26.2 documentation. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/usage\/bufferreg.html. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_27_1","volume-title":"OpenSHMEM Application Programming Interface. https:\/\/github.com\/openshmem-org\/specification. [Online","author":"SHMEM.","year":"2025","unstructured":"OpenSHMEM. 2025. OpenSHMEM Application Programming Interface. https:\/\/github.com\/openshmem-org\/specification. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 6th Conference on Machine Learning and Systems (MLSys).","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently Scaling Transformer Inference. In Proceedings of the 6th Conference on Machine Learning and Systems (MLSys)."},{"key":"e_1_3_2_1_30_1","volume-title":"4th International Conference","volume":"9","author":"Rabenseifner Rolf","year":"2004","unstructured":"Rolf Rabenseifner. 2004. Optimization of Collective Reduction Operations. In Computational Science - ICCS 2004, 4th International Conference, Krak\u00f3w, Poland, June 6-9, 2004, Proceedings, Part I (Lecture Notes in Computer Science, Vol. 3036). Springer, 1-9."},{"key":"e_1_3_2_1_31_1","volume-title":"perftest: Infiniband Verbs Performance Tests. https:\/\/github.com\/linux-rdma\/perftest. [Online","author":"Linux RDMA.","year":"2025","unstructured":"Linux RDMA. 2025. perftest: Infiniband Verbs Performance Tests. https:\/\/github.com\/linux-rdma\/perftest. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_32_1","volume-title":"SGLang is a fast serving framework for large language models and vision language models., https:\/\/github.com\/sgl-project\/sglang. [Online","year":"2025","unstructured":"SGLang. 2025. SGLang is a fast serving framework for large language models and vision language models., https:\/\/github.com\/sgl-project\/sglang. [Online; accessed Dec 2025]."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI).","author":"Shah Aashaka","year":"2023","unstructured":"Aashaka Shah, Vijay Chidambaram, Meghan Cowan, Saeed Maleki, Madan Musuvathi, Todd Mytkowicz, Jacob Nelson, and Olli Saarikivi. 2023. TACCL: Guiding Collective Algorithm Synthesis using Communication Sketches. In Proceedings of the 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI)."},{"key":"e_1_3_2_1_34_1","volume-title":"MPI-the Complete Reference: the MPI core","author":"Snir Marc","unstructured":"Marc Snir, William Gropp, Steve Otto, Steven Huss-Lederman, Jack Dongarra, and David Walker. 1998. MPI-the Complete Reference: the MPI core. Vol. 1. MIT press."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Wang Shibo","year":"2023","unstructured":"Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao Zhang, Sameer Kumar, Tongfei Guo, Yuanzhong Xu, and Zongwei Zhou. 2023. Overlap Communication with Dependent Computation via Decomposition in Large Deep Learning Models. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the 19th USENIX Conference on Operating Systems Design and Implementation (OSDI).","author":"Zhang Dingyan","year":"2025","unstructured":"Dingyan Zhang, Haotian Wang, Yang Liu, Xingda Wei, Yizhou Shan, Rong Chen, and Haibo Chen. 2025. BLITZSCALE: fast and live large model autoscaling with O(1) host caching. In Proceedings of the 19th USENIX Conference on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_37_1","volume-title":"DeepEP: an efficient expert-parallel communication library. https:\/\/github.com\/deepseek-ai\/DeepEP. [Online","author":"Zhao Chenggang","year":"2025","unstructured":"Chenggang Zhao, Shangyan Zhou, Liyue Zhang, Chengqi Deng, Zhean Xu, Yuxuan Liu, Kuai Yu, Jiashi Li, and Liang Zhao. 2025. DeepEP: an efficient expert-parallel communication library. https:\/\/github.com\/deepseek-ai\/DeepEP. [Online; accessed Dec 2025]."}],"event":{"name":"ASPLOS '26: 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"deposited":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T14:06:00Z","timestamp":1773583560000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3779212.3790188"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":37,"alternative-id":["10.1145\/3779212.3790188","10.1145\/3779212"],"URL":"https:\/\/doi.org\/10.1145\/3779212.3790188","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}