{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:31:12Z","timestamp":1773318672382,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T00:00:00Z","timestamp":1763164800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2441601"],"award-info":[{"award-number":["2441601"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759886","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"1315-1331","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["X-MoE: Enabling Scalable Training for Emerging Mixture-of-Experts Architectures on HPC Platforms"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7443-6098","authenticated-orcid":false,"given":"Yueming","family":"Yuan","sequence":"first","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2664-8545","authenticated-orcid":false,"given":"Ahan","family":"Gupta","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1408-8564","authenticated-orcid":false,"given":"Jianping","family":"Li","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5308-914X","authenticated-orcid":false,"given":"Sajal","family":"Dash","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory (ORNL), Oak Ridge, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0099-1559","authenticated-orcid":false,"given":"Feiyi","family":"Wang","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory (ORNL), Oak Ridge, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8165-166X","authenticated-orcid":false,"given":"Minjia","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"Meta AI. 2024. Introducing Meta LLaMA-3. https:\/\/ai.meta.com\/blog\/meta-llama-3\/."},{"key":"e_1_3_3_3_3_2","unstructured":"Meta AI. 2025. Llama 4: Multimodal Intelligence. https:\/\/ai.meta.com\/blog\/llama-4-multimodal-intelligence\/."},{"key":"e_1_3_3_3_4_2","unstructured":"Anthropic. 2024. Claude 3 haiku: our fastest model yet. https:\/\/www.anthropic.com\/news\/claude-3-haiku."},{"key":"e_1_3_3_3_5_2","unstructured":"Argonne National Laboratory. 2024. Aurora Supercomputer. https:\/\/www.alcf.anl.gov\/aurora."},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607089"},{"key":"e_1_3_3_3_7_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS \u201920)","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, and et al.2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems (NeurIPS \u201920)."},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651379"},{"key":"e_1_3_3_3_9_2","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. arxiv:https:\/\/arXiv.org\/abs\/1604.06174\u00a0[cs.LG]"},{"key":"e_1_3_3_3_10_2","first-page":"797","volume-title":"USENIX Symposium on Operating Systems Design and Implementation (OSDI \u201923)","author":"Cui Weihao","year":"2023","unstructured":"Weihao Cui, Zhenhua Han, Lingji Ouyang, Yichuan Wang, Ningxin Zheng, Lingxiao Ma, Yuqing Yang, Fan Yang, Jilong Xue, Lili Qiu, Lidong Zhou, Quan Chen, Haisheng Tan, and Minyi Guo. 2023. Optimizing Dynamic Neural Networks with Brainstorm. In USENIX Symposium on Operating Systems Design and Implementation (OSDI \u201923). 797\u2013815."},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"e_1_3_3_3_12_2","unstructured":"DeepSeek-AI. 2025. DeepSeek-V3 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2412.19437\u00a0[cs.CL]"},{"key":"e_1_3_3_3_13_2","first-page":"224","volume-title":"Proceedings of Machine Learning and Systems (MLSys \u201924)","volume":"6","author":"Du Zhixu","year":"2024","unstructured":"Zhixu Du, Shiyu Li, Yuhao Wu, Xiangyu Jiang, Jingwei Sun, Qilin Zheng, Yongkai Wu, Ang Li, Hai\u00a0Helen Li, and Yiran Chen. 2024. SiDA: Sparsity-Inspired Data-Aware Serving for Efficient and Scalable Large Mixture-of-Experts Models. In Proceedings of Machine Learning and Systems (MLSys \u201924) , Vol.\u00a06. 224\u2013238."},{"key":"e_1_3_3_3_14_2","unstructured":"William Fedus Barret Zoph and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. The Journal of Machine Learning Research (JMLR \u201922) 23 1 (2022) 5232\u20135270."},{"key":"e_1_3_3_3_15_2","unstructured":"Trevor Gale Deepak Narayanan Cliff Young and Matei Zaharia. 2023. Megablocks: Efficient sparse training with mixture-of-experts. Proceedings of Machine Learning and Systems (MLSys \u201923) 5 (2023)."},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508418"},{"key":"e_1_3_3_3_17_2","unstructured":"Changho Hwang Wei Cui Yifan Xiong Ziyue Yang Ze Liu Han Hu Zilong Wang Rafael Salas Jithin Jose Prabhat Ram et\u00a0al. 2023. Tutel: Adaptive mixture-of-experts at scale. Proceedings of Machine Learning and Systems (MLSys \u201923) (2023)."},{"key":"e_1_3_3_3_18_2","unstructured":"Albert\u00a0Q Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra\u00a0Singh Chaplot et\u00a0al. 2024. Mixtral of experts. arXiv:https:\/\/arXiv.org\/abs\/2401.04088 (2024)."},{"key":"e_1_3_3_3_19_2","unstructured":"Chenyu Jiang Ye Tian Zhen Jia Shuai Zheng Chuan Wu and Yida Wang. 2024. Lancet: Accelerating mixture-of-experts training via whole graph computation-communication overlapping. (2024)."},{"key":"e_1_3_3_3_20_2","unstructured":"Heehoon Kim Junyeol Ryu and Jaejin Lee. 2024. TCCL: Discovering Better Communication Paths for PCIe GPU Clusters(ASPLOS \u201924)."},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.19"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_3_23_2","unstructured":"Dmitry Lepikhin HyoukJoong Lee Yuanzhong Xu Dehao Chen Orhan Firat Yanping Huang Maxim Krikun Noam Shazeer and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.16668 (2020)."},{"key":"e_1_3_3_3_24_2","unstructured":"Dmitry Lepikhin HyoukJoong Lee Yuanzhong Xu Dehao Chen Orhan Firat Yanping Huang Maxim Krikun Noam Shazeer and Zhifeng Chen. 2020. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. CoRR abs\/2006.16668 (2020). arxiv:https:\/\/arXiv.org\/abs\/2006.16668"},{"key":"e_1_3_3_3_25_2","unstructured":"Aixin Liu Bei Feng Bin Wang Bingxuan Wang Bo Liu Chenggang Zhao Chengqi Dengr Chong Ruan Damai Dai Daya Guo et\u00a0al. 2024. Deepseek-v2: A strong economical and efficient mixture-of-experts language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.04434 (2024)."},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_3_27_2","volume-title":"NVIDIA TensorRT: Programmable Inference Accelerator","author":"Corporation NVIDIA","year":"2023","unstructured":"NVIDIA Corporation. 2023. NVIDIA TensorRT: Programmable Inference Accelerator. https:\/\/developer.nvidia.com\/tensorrt"},{"key":"e_1_3_3_3_28_2","unstructured":"OpenAI. 2023. GPT-4 Technical Report. CoRR abs\/2303.08774 (2023)."},{"key":"e_1_3_3_3_29_2","unstructured":"OpenAI. 2024. GPT-4o System Card. arxiv:https:\/\/arXiv.org\/abs\/2410.21276\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2410.21276"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"key":"e_1_3_3_3_31_2","volume-title":"International Conference on Machine Learning (ICML\u201922)","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza\u00a0Yazdani Aminabadi, Ammar\u00a0Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale. In International Conference on Machine Learning (ICML\u201922)."},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_3_3_33_2","unstructured":"Noam Shazeer Azalia Mirhoseini Krzysztof Maziarz Andy Davis Quoc Le Geoffrey Hinton and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1701.06538 (2017)."},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593704"},{"key":"e_1_3_3_3_35_2","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew\u00a0M Dai Anja Hauth Katie Millican et\u00a0al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.11805 (2023)."},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_3_3_37_2","volume-title":"Grok","year":"2025","unstructured":"XAI. 2025. Grok. https:\/\/x.ai\/grok"},{"key":"e_1_3_3_3_38_2","unstructured":"Zihao Ye Lequn Chen Ruihang Lai Wuwei Lin Yineng Zhang Stephanie Wang Tianqi Chen Baris Kasikci Vinod Grover Arvind Krishnamurthy and Luis Ceze. 2025. FlashInfer: Efficient and Customizable Attention Engine for LLM Inference Serving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.01005 (2025)."},{"key":"e_1_3_3_3_39_2","unstructured":"Chenggang Zhao Shangyan Zhou Liyue Zhang Chengqi Deng Zhean Xu Yuxuan Liu Kuai Yu Jiashi Li and Liang Zhao. 2025. DeepEP: an efficient expert-parallel communication library. https:\/\/github.com\/deepseek-ai\/DeepEP."},{"key":"e_1_3_3_3_40_2","unstructured":"Lianmin Zheng Liangsheng Yin Zhiqiang Xie Jeff Huang Chuyue Sun Cody\u00a0Hao Yu Shiyi Cao Christos Kozyrakis Ion Stoica Joseph\u00a0E. Gonzalez Clark\u00a0W. Barrett and Ying Sheng. 2023. Efficiently Programming Large Language Models using SGLang. CoRR abs\/2312.07104 (2023)."}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3712285.3759886","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759886","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759886","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:48:11Z","timestamp":1773254891000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759886"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":39,"alternative-id":["10.1145\/3712285.3759886","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759886","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}