{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T21:33:50Z","timestamp":1777066430911,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62325405"],"award-info":[{"award-number":["62325405"]}]},{"name":"National Natural Science Foundation of China","award":["U24B6015"],"award-info":[{"award-number":["U24B6015"]}]},{"name":"Beijing Natural Science Foundation","award":["L242018"],"award-info":[{"award-number":["L242018"]}]},{"name":"Beijing Natural Science Foundation","award":["L257010"],"award-info":[{"award-number":["L257010"]}]},{"name":"Beijing Municipal Science and Technology Project","award":["Z241100004224013"],"award-info":[{"award-number":["Z241100004224013"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769370","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"1894-1911","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Efficient and Adaptable Overlapping for Computation and Communication via Signaling and Reordering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5768-6037","authenticated-orcid":false,"given":"Ke","family":"Hong","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4896-121X","authenticated-orcid":false,"given":"Xiuhong","family":"Li","sequence":"additional","affiliation":[{"name":"Infinigence-AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1905-0977","authenticated-orcid":false,"given":"Minxu","family":"Liu","sequence":"additional","affiliation":[{"name":"Infinigence-AI, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8777-2579","authenticated-orcid":false,"given":"Qiuli","family":"Mao","sequence":"additional","affiliation":[{"name":"Infinigence-AI, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3445-7055","authenticated-orcid":false,"given":"Tianqi","family":"Wu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1273-2573","authenticated-orcid":false,"given":"Zixiao","family":"Huang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8345-3882","authenticated-orcid":false,"given":"Lufang","family":"Chen","sequence":"additional","affiliation":[{"name":"Infinigence-AI, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0500-5824","authenticated-orcid":false,"given":"Zhong","family":"Wang","sequence":"additional","affiliation":[{"name":"Infinigence-AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0937-3116","authenticated-orcid":false,"given":"Yichong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9259-7180","authenticated-orcid":false,"given":"Zhenhua","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0849-3252","authenticated-orcid":false,"given":"Guohao","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6108-5157","authenticated-orcid":false,"given":"Yu","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Mixtral of Experts. arXiv preprint arXiv:2401.04088","author":"Mistral AI.","year":"2024","unstructured":"Mistral AI. 2024. Mixtral of Experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Ascend. 2025. CANN Samples. [Online]. https:\/\/gitee.com\/ascend\/samples."},{"key":"e_1_3_2_1_3_1","unstructured":"Ascend. 2025. Huawei Collective Communication Library. [Online]. https:\/\/gitee.com\/ascend\/cann-hccl."},{"key":"e_1_3_2_1_4_1","volume-title":"FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion. arXiv preprint arXiv:2406.06858","author":"Chang Li-Wen","year":"2024","unstructured":"Li-Wen Chang, Wenlei Bao, Qi Hou, Chengquan Jiang, Ningxin Zheng, Yinmin Zhong, Xuanrun Zhang, Zuquan Song, Ziheng Jiang, Haibin Lin, Xin Jin, and Xin Liu. 2024. FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion. arXiv preprint arXiv:2406.06858 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems. 178\u2013191","author":"Chen Chang","year":"2024","unstructured":"Chang Chen, Xiuhong Li, Qianchao Zhu, Jiangfei Duan, Peng Sun, Xingcheng Zhang, and Chao Yang. 2024. Centauri: Enabling Efficient Scheduling for Communication-Computation Overlap in Large Model Training via Communication Partitioning. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems. 178\u2013191."},{"key":"e_1_3_2_1_6_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming.","author":"Chen Zhaodong","year":"2024","unstructured":"Zhaodong Chen, Andrew Kerr, Richard Cai, Jack Kosaian, Haicheng Wu, Yufei Ding, and Yuan Xie. 2024. EVT: Accelerating Deep Learning Training with Epilogue Visitor Tree. In Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Cowan Meghan","year":"2023","unstructured":"Meghan Cowan, Saeed Maleki, Madanlal Musuvathi, Olli Saarikivi, and Yifan Xiong. 2023. Mscclang: Microsoft collective communication language. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2. 502\u2013514."},{"key":"e_1_3_2_1_9_1","unstructured":"DeepSeek-AI. 2024. DeepSeek-V3 Technical Report. arXiv preprint arXiv:2412.19437v1 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"DeepSeek-AI. 2025. DeepEP: an efficient expert-parallel communication library. https:\/\/github.com\/deepseek-ai\/DeepEP."},{"key":"e_1_3_2_1_11_1","volume-title":"xDiT: an Inference Engine for Diffusion Transformers (DiTs) with Massive Parallelism. arXiv preprint arXiv:2411.01738","author":"Fang Jiarui","year":"2024","unstructured":"Jiarui Fang, Jinzhe Pan, Xibo Sun, Aoyu Li, and Jiannan Wang. 2024. xDiT: an Inference Engine for Diffusion Transformers (DiTs) with Massive Parallelism. arXiv preprint arXiv:2411.01738 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming.","author":"He Jiaao","year":"2022","unstructured":"Jiaao He, Jidong Zhai, Tiago Antunes, Haojie Wang, Fuwen Luo, Shangfeng Shi, and Qin Li. 2022. FasterMoE: modeling and optimizing training of large-scale dynamic pre-trained models. In Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming."},{"key":"e_1_3_2_1_13_1","volume-title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers. arXiv preprint arXiv:2205.15868","author":"Hong Wenyi","year":"2022","unstructured":"Wenyi Hong, Ming Ding, Wendi Zheng, Xinghan Liu, and Jie Tang. 2022. CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers. arXiv preprint arXiv:2205.15868 (2022)."},{"key":"e_1_3_2_1_14_1","volume-title":"Understanding the planning of LLM agents: A survey. arXiv preprint arXiv:2402.02716","author":"Huang Xu","year":"2024","unstructured":"Xu Huang, Weiwen Liu, Xiaolong Chen, Xingmei Wang, Hao Wang, Defu Lian, Yasheng Wang, Ruiming Tang, and Enhong Chen. 2024. Understanding the planning of LLM agents: A survey. arXiv preprint arXiv:2402.02716 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems.","author":"Jangda Abhinav","year":"2022","unstructured":"Abhinav Jangda, Jun Huang, Guodong Liu, Amir Hossein Nodehi Sabet, Saeed Maleki, Youshan Miao, Madanlal Musuvathi, Todd Mytkowicz, and Olli Saarikivi. 2022. Breaking the computation and communication abstraction barrier in distributed machine learning workloads. In Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 7th Annual Conference on Machine Learning and Systems.","author":"Jiang Chenyu","year":"2024","unstructured":"Chenyu Jiang, Ye Tian, Zhen Jia, Shuai Zheng, Chuan Wu, and Yida Wang. 2024. Lancet: Accelerating Mixture-of-Experts Training via Whole Graph Computation-Communication Overlapping. In Proceedings of the 7th Annual Conference on Machine Learning and Systems."},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the 21st USENIX Symposium on Networked System Design and Implementation.","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, Yulu Jia, Sun He, Hongmin Chen, Zhihao Bai, Qi Hou, Shipeng Yan, Ding Zhou, Yiyao Sheng, Zhuo Jiang, Haohan Xu, Haoran Wei, Zhang Zhang, Pengfei Nie, Leqi Zou, Sida Zhao, Liang Xiang, Zherui Liu, Zhe Li, Xiaoying Jia, Jianxi Ye, Xin Jin, and Xin Liu. 2024. Overlap Communication with Dependent Computation via Decomposition in Large Deep Learning Models. In Proceedings of the 21st USENIX Symposium on Networked System Design and Implementation."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles. 611\u2013626","author":"Woosuk","unstructured":"Woosuk Kwon et al. 2023. Efficient Memory Management for Large Language Model Serving with PagedAttention. In Proceedings of the 29th Symposium on Operating Systems Principles. 611\u2013626."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2508834.2513149"},{"key":"e_1_3_2_1_20_1","volume-title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations.","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2021. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","volume-title":"PyTorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, and Soumith Chintala. 2020. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv preprint arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_1_22_1","unstructured":"Zijun Liu Yanzhe Zhang Peng Li Yang Liu and Diyi Yang. 2023. Dynamic LLM-Agent Network: An LLM-agent Collaboration Framework with Agent Team Optimization. arXiv:2310.02170 [cs.CL]"},{"key":"e_1_3_2_1_23_1","unstructured":"Meta. 2025. The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation. [Online]. https:\/\/ai.meta.com\/blog\/llama-4-multimodal-intelligence\/."},{"key":"e_1_3_2_1_24_1","unstructured":"NVIDIA. 2017. cuBLAS: Basic Linear Algebra on NVIDIA GPUs. [Online]. https:\/\/developer.nvidia.com\/cublas."},{"key":"e_1_3_2_1_25_1","volume-title":"CUTLASS: CUDA Templates for Linear Algebra Subroutines. [Online]. https:\/\/github.com\/NVIDIA\/cutlass.","author":"NVIDIA.","year":"2017","unstructured":"NVIDIA. 2017. CUTLASS: CUDA Templates for Linear Algebra Subroutines. [Online]. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_2_1_26_1","unstructured":"NVIDIA. 2023. GPU Performance Background User's Guide. [Online]. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-gpu-background."},{"key":"e_1_3_2_1_27_1","unstructured":"NVIDIA. 2025. The building blocks of high-speed multi-GPU communication for feeding large datasets faster into models and rapidly exchanging data between GPUs. [Online]. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/."},{"key":"e_1_3_2_1_28_1","unstructured":"NVIDIA. 2025. cuBLASMp: A High-Performance CUDA Library for Distributed Dense Linear Algebra. https:\/\/docs.nvidia.com\/cuda\/cublasmp\/."},{"key":"e_1_3_2_1_29_1","unstructured":"Nvidia. 2025. CUDA Toolkit. [Online]. https:\/\/developer.nvidia.com\/cuda-toolkit."},{"key":"e_1_3_2_1_30_1","unstructured":"NVIDIA. 2025. GPU Optimized Techniques for Training Transformer Models At-Scale. [Online]. https:\/\/github.com\/NVIDIA\/Megatron-LM."},{"key":"e_1_3_2_1_31_1","unstructured":"NVIDIA. 2025. InfiniBand. [Online]. https:\/\/www.nvidia.com\/en-us\/networking\/products\/infiniband\/."},{"key":"e_1_3_2_1_32_1","unstructured":"NVIDIA. 2025. NVIDIA Collective Communication Library. [Online]. https:\/\/docs.nvidia.com\/deeplearning\/nccl."},{"key":"e_1_3_2_1_33_1","unstructured":"NVIDIA. 2025. Parallel Thread Execution ISA Version 8.7. [Online]. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution."},{"key":"e_1_3_2_1_34_1","unstructured":"NVIDIA. 2025. Stream Management Functions of the Low-Level CUDA Driver Application Programming Interface. [Online]. https:\/\/docs.nvidia.com\/cuda\/cuda-driver-api\/."},{"key":"e_1_3_2_1_35_1","unstructured":"OpenAI. 2025. ChatGPT. [Online]. https:\/\/chatgpt.com\/."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, 429\u2013431","author":"Osama Muhammad","unstructured":"Muhammad Osama, Duane Merrill, Cris Cecka, Michael Garland, and John D. Owens. 2023. Stream-K: Work-Centric Parallel Decomposition for Dense Matrix-Matrix Multiplication on the GPU. Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, 429\u2013431."},{"key":"e_1_3_2_1_37_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","first-page":"117","DOI":"10.1016\/j.jpdc.2008.09.002","article-title":"Bandwidth optimal all-reduce algorithms for clusters of workstations","volume":"9","author":"Patarasuk Pitch","year":"2009","unstructured":"Pitch Patarasuk and Xin Yuan. 2009. Bandwidth optimal all-reduce algorithms for clusters of workstations. J. Parallel and Distrib. Comput. 9 (2009), 117\u2013124. Issue 2.","journal-title":"J. Parallel and Distrib. Comput."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Pati Suchita","year":"2024","unstructured":"Suchita Pati, Shaizeen Aga, Mahzabeen Islam, Nuwan Jayasena, and Matthew D Sinclair. 2024. T3: Transparent tracking & triggering for fine-grained overlap of compute & collectives. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2. 1146\u20131164."},{"key":"e_1_3_2_1_40_1","unstructured":"PCI-SIG. 2025. PCI Express 6.0 Specification. [Online]. https:\/\/pcisig.com\/pci-express-6.0-specification."},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis. 1\u201317","author":"Punniyamurthy Kishore","unstructured":"Kishore Punniyamurthy, Khaled Hamidouche, and Bradford M. Beckmann. 2024. Optimizing Distributed ML Communication with Fused Computation-Collective Operations. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis. 1\u201317."},{"key":"e_1_3_2_1_42_1","unstructured":"PyTorch. 2024. Introducing Async Tensor Parallelism in PyTorch. [Online]. https:\/\/discuss.pytorch.org\/t\/distributed-w-torchtitan-introducing-async-tensor-parallelism-in-pytorch\/209487."},{"key":"e_1_3_2_1_43_1","unstructured":"Baptiste Rozi\u00e8re Jonas Gehring Fabian Gloeckle Sten Sootla Itai Gat Xiaoqing Ellen Tan Yossi Adi Jingyu Liu Romain Sauvestre Tal Remez J\u00e9r\u00e9my Rapin Artyom Kozhevnikov Ivan Evtimov Joanna Bitton Manish Bhatt Cristian Canton Ferrer Aaron Grattafiori Wenhan Xiong Alexandre D\u00e9fossez Jade Copet Faisal Azhar Hugo Touvron Louis Martin Nicolas Usunier Thomas Scialom and Gabriel Synnaeve. 2023. Code Llama: Open Foundation Models for Code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_45_1","volume-title":"Make a Video: Text-to-Video Generation without Text-Video Data. arXiv preprint arXiv:2209.14792","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, Devi Parikh, Sonal Gupta, and Yaniv Taigman. 2022. Make a Video: Text-to-Video Generation without Text-Video Data. arXiv preprint arXiv:2209.14792 (2022)."},{"key":"e_1_3_2_1_46_1","unstructured":"StepFun. 2025. Step-Video-T2V Technical Report: The Practice Challenges and Future of Video Foundation Model. (2025). arXiv:2502.10248 [cs.CV] https:\/\/arxiv.org\/abs\/2502.10248"},{"key":"e_1_3_2_1_47_1","unstructured":"Llama3 Team. 2024. The Llama 3 Herd of Models. (2024). arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_48_1","volume-title":"WAN: Open and Advanced Large-Scale Video Generative Models. arXiv preprint arXiv:2503.20314","author":"Alibaba Group Wan Team.","year":"2025","unstructured":"Alibaba Group Wan Team. 2025. WAN: Open and Advanced Large-Scale Video Generative Models. arXiv preprint arXiv:2503.20314 (2025)."},{"key":"e_1_3_2_1_49_1","volume-title":"Domino: Eliminating Communication in LLM Training via Generic Tensor Slicing and Overlapping. arXiv preprint arXiv:2409.15241","author":"Wang Guanhua","year":"2024","unstructured":"Guanhua Wang, Chengming Zhang, Zheyu Shen, Ang Li, and Olatunji Ruwase. 2024. Domino: Eliminating Communication in LLM Training via Generic Tensor Slicing and Overlapping. arXiv preprint arXiv:2409.15241 (2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"Hiding Communication Cost in Distributed LLM Training via Micro-batch Co-execution. arXiv preprint arXiv:2411.15871","author":"Wang Haiquan","year":"2024","unstructured":"Haiquan Wang, Chaoyi Ruan, Jia He, Jiaqi Ruan, Chengjie Tang, Xiaosong Ma, and Cheng Li. 2024. Hiding Communication Cost in Distributed LLM Training via Micro-batch Co-execution. arXiv preprint arXiv:2411.15871 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming.","author":"Wang Hulin","year":"2025","unstructured":"Hulin Wang, Yaqi Xia, Donglin Yang, Xiaobo Zhou, and Dazhao Cheng. 2025. Harnessing Inter-GPU Shared Memory for Seamless MoE Communication-Computation Fusion. In Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming."},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems.","author":"Wang Shibo","year":"2023","unstructured":"Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao Zhang, Sameer Kumar, Tongfei Guo, Yuanzhong Xu, and Zongwei Zhou. 2023. Overlap Communication with Dependent Computation via Decomposition in Large Deep Learning Models. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems."},{"key":"e_1_3_2_1_53_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et al. 2024. CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer. arXiv preprint arXiv:2408.06072 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the 8th Annual Conference on Machine Learning and Systems.","author":"Zhang Shulai","year":"2025","unstructured":"Shulai Zhang, Ningxin Zheng, Haibin Lin, Ziheng Jiang, Wenlei Bao, Chengquan Jiang, Qi Hou, Weihao Cui, Size Zheng, Li-Wen Chang, Quan Chen, and Xin Liu. 2025. Comet: Fine-grained Computation-communication Overlapping for Mixture-of-Experts. In Proceedings of the 8th Annual Conference on Machine Learning and Systems."},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the VLDB Endowment. 3848\u20133860","author":"Zhao Yanli","year":"2023","unstructured":"Yanli Zhao, Andrew Gu, Rohan Varma, Liang Luo, Chien-Chin Huang, Min Xu, Less Wright, Hamid Shojanazeri, Myle Ott, Sam Shleifer, Alban Desmaison, Can Balioglu, Pritam Damania, Bernard Nguyen, Geeta Chauhan, Yuchen Hao, Ajit Mathews, and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. In Proceedings of the VLDB Endowment. 3848\u20133860."},{"key":"e_1_3_2_1_56_1","volume-title":"TileLink: Generating Efficient Compute-Communication Overlapping Kernels using Tile-Centric Primitives. arXiv preprint arXiv:2503.20313","author":"Zheng Size","year":"2025","unstructured":"Size Zheng, Jin Fang, Xuegui Zheng, Qi Hou, Wenlei Bao, Ningxin Zheng, Ziheng Jiang, Dongyang Wang, Jianxi Ye, Haibin Lin, Li-Wen Chang, and Xin Liu. 2025. TileLink: Generating Efficient Compute-Communication Overlapping Kernels using Tile-Centric Primitives. arXiv preprint arXiv:2503.20313 (2025)."},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the 19th USENIX Symposium on Operating Systems Design and Implementation. 749\u2013765","author":"Zhu Kan","year":"2025","unstructured":"Kan Zhu, Yilong Zhao, Liangyu Zhao, Gefei Zuo, Yile Gu, Dedong Xie, Yufei Gao, Qinyu Xu, Tian Tang, Zihao Ye, et al. 2025. NanoFlow: Towards Optimal Large Language Model Serving Throughput. In Proceedings of the 19th USENIX Symposium on Operating Systems Design and Implementation. 749\u2013765."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:34:19Z","timestamp":1777062859000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769370"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":57,"alternative-id":["10.1145\/3767295.3769370","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769370","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}