{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:51:38Z","timestamp":1777063898070,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62172008"],"award-info":[{"award-number":["62172008"]}]},{"name":"National Natural Science Foundation of China","award":["62325201"],"award-info":[{"award-number":["62325201"]}]},{"name":"National Key Research and Development Program of China","award":["2022YFB4500700"],"award-info":[{"award-number":["2022YFB4500700"]}]},{"name":"Scientific Research Innovation Capability Support Project for Young Faculty","award":["ZYGXQNJSKYCXNLZCXM-I1"],"award-info":[{"award-number":["ZYGXQNJSKYCXNLZCXM-I1"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769325","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"366-382","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MegaScale-MoE: Large-Scale Communication-Efficient Training of Mixture-of-Experts Models in Production"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-1355-4995","authenticated-orcid":false,"given":"Chao","family":"Jin","sequence":"first","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7732-4391","authenticated-orcid":false,"given":"Ziheng","family":"Jiang","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2223-1921","authenticated-orcid":false,"given":"Zhihao","family":"Bai","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2066-1779","authenticated-orcid":false,"given":"Zheng","family":"Zhong","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5783-731X","authenticated-orcid":false,"given":"Juncai","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8712-9692","authenticated-orcid":false,"given":"Xiang","family":"Li","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1147-6984","authenticated-orcid":false,"given":"Ningxin","family":"Zheng","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4036-2175","authenticated-orcid":false,"given":"Xi","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5682-0230","authenticated-orcid":false,"given":"Cong","family":"Xie","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2143-7494","authenticated-orcid":false,"given":"Qi","family":"Huang","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7862-8059","authenticated-orcid":false,"given":"Wen","family":"Heng","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1070-7256","authenticated-orcid":false,"given":"Yiyuan","family":"Ma","sequence":"additional","affiliation":[{"name":"ByteDance Seed, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0826-8283","authenticated-orcid":false,"given":"Wenlei","family":"Bao","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9471-1780","authenticated-orcid":false,"given":"Size","family":"Zheng","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4635-7309","authenticated-orcid":false,"given":"Xuegui","family":"Zheng","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3989-4358","authenticated-orcid":false,"given":"Yanghua","family":"Peng","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4879-5335","authenticated-orcid":false,"given":"Haibin","family":"Lin","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7908-8484","authenticated-orcid":false,"given":"Xuanzhe","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8741-5847","authenticated-orcid":false,"given":"Xin","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8346-3323","authenticated-orcid":false,"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Seattle, USA"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2025. Context parallelism in Megatron-LM. (2025). https:\/\/docs.nvidia.com\/megatron-core\/developer-guide\/latest\/api-guide\/context_parallel.html."},{"key":"e_1_3_2_1_2_1","unstructured":"2025. Introducing DBRX: A New State-of-the-Art Open LLM. (2025). https:\/\/www.databricks.com\/blog\/introducing-dbrx-new-state-art-open-llm"},{"key":"e_1_3_2_1_3_1","unstructured":"2025. Open Release of Grok-1. (2025). https:\/\/x.ai\/blog\/grok-os"},{"key":"e_1_3_2_1_4_1","volume-title":"Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245 (2023)."},{"key":"e_1_3_2_1_5_1","unstructured":"Li-Wen Chang Wenlei Bao Qi Hou Chengquan Jiang Ningxin Zheng Yinmin Zhong Xuanrun Zhang Zuquan Song Chengji Yao Ziheng Jiang et al. 2024. FLUX: fast software-based communication overlap on gpus through kernel fusion. arXiv preprint arXiv:2406.06858 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Centauri: Enabling Efficient Scheduling for Communication-Computation Overlap in Large Model Training via Communication Partitioning. 
In ACM ASPLOS.","author":"Chen Chang","year":"2024","unstructured":"Chang Chen, Xiuhong Li, Qianchao Zhu, Jiangfei Duan, Peng Sun, Xingcheng Zhang, and Chao Yang. 2024. Centauri: Enabling Efficient Scheduling for Communication-Computation Overlap in Large Model Training via Communication Partitioning. In ACM ASPLOS."},{"key":"e_1_3_2_1_7_1","volume-title":"Charles Sutton, Sebastian Gehrmann, et al.","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness. Neural Information Processing Systems","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Neural Information Processing Systems (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"GLaM: Efficient Scaling of Language Models with Mixture-of-Experts. In International Conference on Machine Learning (ICML).","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, Barret Zoph, Liam Fedus, Maarten P Bosma, Zongwei Zhou, Tao Wang, Emma Wang, Kellie Webster, Marie Pellat, Kevin Robinson, Kathleen Meier-Hellstern, Toju Duke, Lucas Dixon, Kun Zhang, Quoc Le, Yonghui Wu, Zhifeng Chen, and Claire Cui. 2022. GLaM: Efficient Scaling of Language Models with Mixture-of-Experts. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_10_1","volume-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"Loongtrain: Efficient training of long-sequence llms with head-context parallelism. arXiv preprint arXiv:2406.18485","author":"Gu Diandian","year":"2024","unstructured":"Diandian Gu, Peng Sun, Qinghao Hu, Ting Huang, Xun Chen, Yingtong Xiong, Guoteng Wang, Qiaoling Chen, Shangchun Zhao, Jiarui Fang, et al. 2024. Loongtrain: Efficient training of long-sequence llms with head-context parallelism. arXiv preprint arXiv:2406.18485 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of Machine Learning and Systems","author":"Hashemi Sayed Hadi","year":"2019","unstructured":"Sayed Hadi Hashemi, Sangeetha Abdu Jyothi, and Roy Campbell. 2019. Tictac: Accelerating distributed deep learning with communication scheduling. Proceedings of Machine Learning and Systems (2019)."},{"key":"e_1_3_2_1_13_1","unstructured":"Jiaao He Jidong Zhai Tiago Antunes Haojie Wang Fuwen Luo Shangfeng Shi and Qin Li. 2022. Fastermoe: modeling and optimizing training of large-scale dynamic pre-trained models. In ACM PPoPP."},{"key":"e_1_3_2_1_14_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. 
Neural Information Processing Systems","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Neural Information Processing Systems (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of Machine Learning and Systems","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, Wei Cui, Yifan Xiong, Ziyue Yang, Ze Liu, Han Hu, Zilong Wang, Rafael Salas, Jithin Jose, Prabhat Ram, et al. 2023. Tutel: Adaptive mixture-of-experts at scale. Proceedings of Machine Learning and Systems (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Samyam Rajbhandari, and Yuxiong He.","author":"Jacobs Sam Ade","year":"2023","unstructured":"Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Shuaiwen Leon Song, Samyam Rajbhandari, and Yuxiong He. 2023. Deepspeed ulysses: System optimizations for enabling training of extreme long sequence transformer models. arXiv preprint arXiv:2309.14509 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Saeed Maleki, Youshan Miao, Madanlal Musuvathi, Todd Mytkowicz, and Olli Saarikivi.","author":"Jangda Abhinav","year":"2022","unstructured":"Abhinav Jangda, Jun Huang, Guodong Liu, Amir Hossein Nodehi Sabet, Saeed Maleki, Youshan Miao, Madanlal Musuvathi, Todd Mytkowicz, and Olli Saarikivi. 2022. Breaking the computation and communication abstraction barrier in distributed machine learning workloads. In ACM ASPLOS."},{"key":"e_1_3_2_1_18_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. 2024. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_19_1","unstructured":"Ziheng Jiang Haibin Lin Yinmin Zhong Qi Huang Yangrui Chen Zhi Zhang Yanghua Peng Xiang Li Cong Xie Shibiao Nong Yulu Jia Sun He Hongmin Chen Zhihao Bai Qi Hou Shipeng Yan Ding Zhou Yiyao Sheng Zhuo Jiang Haohan Xu Haoran Wei Zhang Zhang Pengfei Nie Leqi Zou Sida Zhao Liang Xiang Zherui Liu Zhe Li Xiaoying Jia Jianxi Ye Xin Jin and Xin Liu. 2024. MegaScale: Scaling Large Language Model Training to More Than 10 000 GPUs. In USENIX NSDI."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of Machine Learning and Systems","author":"Korthikanti Vijay Anand","year":"2023","unstructured":"Vijay Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. 2023. Reducing activation recomputation in large transformer models. Proceedings of Machine Learning and Systems (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"DISTFLASHATTN: Distributed Memory-efficient Attention for Long-context LLMs Training. arxiv preprint arXiv:2310.03294","author":"Li Dacheng","year":"2024","unstructured":"Dacheng Li, Rulin Shao, Anze Xie, Eric P. Xing, Xuezhe Ma, Ion Stoica, Joseph E. Gonzalez, and Hao Zhang. 2024. DISTFLASHATTN: Distributed Memory-efficient Attention for Long-context LLMs Training. arxiv preprint arXiv:2310.03294 (2024)."},{"key":"e_1_3_2_1_22_1","unstructured":"Jiamin Li Yimin Jiang Yibo Zhu Cong Wang and Hong Xu. 2023. Accelerating distributed {MoE} training and inference with lina. 
In USENIX ATC."},{"key":"e_1_3_2_1_23_1","volume-title":"Sequence parallelism: Long sequence training from system perspective. arXiv preprint arXiv:2105.13120","author":"Li Shenggui","year":"2021","unstructured":"Shenggui Li, Fuzhao Xue, Chaitanya Baranwal, Yongbin Li, and Yang You. 2021. Sequence parallelism: Long sequence training from system perspective. arXiv preprint arXiv:2105.13120 (2021)."},{"key":"e_1_3_2_1_24_1","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania et al. 2020. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_1_25_1","unstructured":"Wanchao Liang Tianyu Liu Less Wright Will Constable Andrew Gu Chien-Chin Huang Iris Zhang Wei Feng Howard Huang Junjie Wang et al. 2024. TorchTitan: One-stop PyTorch native solution for production ready LLM pre-training. arXiv preprint arXiv:2410.06511 (2024)."},{"key":"e_1_3_2_1_26_1","unstructured":"Aixin Liu Bei Feng Bin Wang Bingxuan Wang Bo Liu Chenggang Zhao Chengqi Dengr Chong Ruan Damai Dai Daya Guo et al. 2024. Deepseek-v2: A strong economical and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434 (2024)."},{"key":"e_1_3_2_1_27_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Blockwise Parallel Transformers for Large Context Models. Neural Information Processing Systems","author":"Liu Hao","year":"2024","unstructured":"Hao Liu and Pieter Abbeel. 2024. Blockwise Parallel Transformers for Large Context Models. Neural Information Processing Systems (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889","author":"Liu Hao","year":"2023","unstructured":"Hao Liu, Matei Zaharia, and Pieter Abbeel. 2023. Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Jessie Hui Wang, and Yimin Jiang","author":"Liu Juncai","year":"2023","unstructured":"Juncai Liu, Jessie Hui Wang, and Yimin Jiang. 2023. Janus: A unified distributed training framework for sparse mixture-of-experts models. In ACM SIGCOMM."},{"key":"e_1_3_2_1_31_1","volume-title":"Better Together: Jointly Optimizing ML Collective Scheduling and Execution Planning using {SYNDICATE}. In USENIX NSDI.","author":"Mahajan Kshiteej","year":"2023","unstructured":"Kshiteej Mahajan, Ching-Hsiang Chu, Srinivas Sridharan, and Aditya Akella. 2023. Better Together: Jointly Optimizing ML Collective Scheduling and Execution Planning using {SYNDICATE}. In USENIX NSDI."},{"key":"e_1_3_2_1_32_1","unstructured":"Megatron-LM 2025. GPU optimized techniques for training transformer models at-scale. (2025). https:\/\/github.com\/NVIDIA\/Megatron-LM."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Deepak Narayanan Aaron Harlap Amar Phanishayee Vivek Seshadri Nikhil R Devanur Gregory R Ganger Phillip B Gibbons and Matei Zaharia. 2019. PipeDream: generalized pipeline parallelism for DNN training. In ACM SOSP.","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_35_1","unstructured":"NCCL 2025. 
Optimized primitives for inter-GPU communication. (2025). https:\/\/github.com\/NVIDIA\/nccl."},{"key":"e_1_3_2_1_36_1","volume-title":"HetuMoE: An efficient trillion-scale mixture-of-expert distributed training system. arXiv preprint arXiv:2203.14685","author":"Nie Xiaonan","year":"2022","unstructured":"Xiaonan Nie, Pinxue Zhao, Xupeng Miao, Tong Zhao, and Bin Cui. 2022. HetuMoE: An efficient trillion-scale mixture-of-expert distributed training system. arXiv preprint arXiv:2203.14685 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"Sinclair","author":"Pati Suchita","year":"2024","unstructured":"Suchita Pati, Shaizeen Aga, Mahzabeen Islam, Nuwan Jayasena, and Matthew D. Sinclair. 2024. T3: Transparent Tracking & Triggering for Fine-grained Overlap of Compute & Collectives. In ACM ASPLOS."},{"key":"e_1_3_2_1_38_1","unstructured":"Houwen Peng Kan Wu Yixuan Wei Guoshuai Zhao Yuxiang Yang Ze Liu Yifan Xiong Ziyue Yang Bolin Ni Jingcheng Hu et al. 2023. Fp8-lm: Training fp8 large language models. arXiv preprint arXiv:2310.18313 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Yanghua Peng Yibo Zhu Yangrui Chen Yixin Bao Bairen Yi Chang Lan Chuan Wu and Chuanxiong Guo. 2019. A generic communication scheduler for distributed DNN training acceleration. In ACM SOSP.","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_44_1","volume-title":"Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He.","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. Zero-offload: Democratizing billion-scale model training. In USENIX ATC."},{"key":"e_1_3_2_1_45_1","volume-title":"Glu variants improve transformer. arXiv preprint arXiv:2002.05202","author":"Shazeer Noam","year":"2020","unstructured":"Noam Shazeer. 2020. Glu variants improve transformer. arXiv preprint arXiv:2002.05202 (2020)."},{"key":"e_1_3_2_1_46_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_47_1","volume-title":"Se-moe: A scalable and efficient mixture-of-experts distributed training and inference system. arXiv preprint arXiv:2205.10034","author":"Shen Liang","year":"2022","unstructured":"Liang Shen, Zhihua Wu, WeiBao Gong, Hongxiang Hao, Yangfan Bai, HuaChao Wu, Xinxuan Wu, Jiang Bian, Haoyi Xiong, Dianhai Yu, et al. 2022. 
Se-moe: A scalable and efficient mixture-of-experts distributed training and inference system. arXiv preprint arXiv:2205.10034 (2022)."},{"key":"e_1_3_2_1_48_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_49_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_50_1","unstructured":"TransformerEngine 2025. A library for accelerating Transformer models on NVIDIA GPUs including using 8-bit floating point (FP8) precision on Hopper and Ada GPUs to provide better performance with lower memory utilization in both training and inference. (2025). https:\/\/github.com\/NVIDIA\/TransformerEngine."},{"key":"e_1_3_2_1_51_1","volume-title":"Attention is all you need. Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_52_1","volume-title":"Marcello Maggioni, Qiao Zhang, et al.","author":"Wang Shibo","year":"2022","unstructured":"Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao Zhang, et al. 2022. Overlap communication with dependent computation via decomposition in large deep learning models. In ACM ASPLOS."},{"key":"e_1_3_2_1_53_1","volume-title":"Comet: Fine-grained Computation-communication Overlapping for Mixture-of-Experts. arXiv preprint arXiv:2502.19811","author":"Zhang Shulai","year":"2025","unstructured":"Shulai Zhang, Ningxin Zheng, Haibin Lin, Ziheng Jiang, Wenlei Bao, Chengquan Jiang, Qi Hou, Weihao Cui, Size Zheng, Li-Wen Chang, et al. 2025. Comet: Fine-grained Computation-communication Overlapping for Mixture-of-Experts. arXiv preprint arXiv:2502.19811 (2025)."},{"key":"e_1_3_2_1_54_1","volume-title":"Disttrain: Addressing model and data heterogeneity with disaggregated training for multimodal large language models. In ACM SIGCOMM.","author":"Zhang Zili","year":"2025","unstructured":"Zili Zhang, Yinmin Zhong, Yimin Jiang, Hanpeng Hu, Jianjian Sun, Zheng Ge, Yibo Zhu, Daxin Jiang, and Xin Jin. 2025. Disttrain: Addressing model and data heterogeneity with disaggregated training for multimodal large language models. In ACM SIGCOMM."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_56_1","volume-title":"Tilelink: Generating efficient compute-communication overlapping kernels using tile-centric primitives. arXiv preprint arXiv:2503.20313","author":"Zheng Size","year":"2025","unstructured":"Size Zheng, Jin Fang, Xuegui Zheng, Qi Hou, Wenlei Bao, Ningxin Zheng, Ziheng Jiang, Dongyang Wang, Jianxi Ye, Haibin Lin, et al. 2025. Tilelink: Generating efficient compute-communication overlapping kernels using tile-centric primitives. 
arXiv preprint arXiv:2503.20313 (2025)."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:22:23Z","timestamp":1777062143000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769325"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":56,"alternative-id":["10.1145\/3767295.3769325","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769325","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
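The record above follows the standard Crossref works-API envelope (a "status"/"message-type" wrapper around a "message" object holding title, author, page, container-title, and the reference array). As an illustrative sketch only, not part of the record itself, the snippet below shows how such a record can be retrieved by DOI from the public Crossref REST endpoint (https://api.crossref.org/works/{doi}) and how the fields visible above can be read out; the endpoint and field names are as documented by Crossref, while the formatting of the printed citation line is an arbitrary choice for illustration.

# Illustrative sketch: fetch this work's Crossref metadata by DOI and extract
# a few of the fields that appear in the record above. Uses only the Python
# standard library; network access is assumed.
import json
import urllib.request

DOI = "10.1145/3767295.3769325"  # DOI taken from the record above

# The Crossref works endpoint returns the same envelope shown above:
# {"status": "...", "message-type": "work", ..., "message": {...}}
with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    work = json.load(resp)["message"]

title = work["title"][0]                      # "title" is a list of strings
authors = ", ".join(f'{a["given"]} {a["family"]}'
                    for a in work.get("author", []))
venue = work.get("container-title", [""])[0]  # proceedings title
pages = work.get("page", "")                  # e.g. "366-382"

# Print a simple one-line citation assembled from the extracted fields.
print(f"{authors}. {title}. {venue}, pp. {pages}. https://doi.org/{DOI}")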