{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,9]],"date-time":"2026-07-09T15:23:43Z","timestamp":1783610623709,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Swiss National Science Foundation","award":["10.001.796"],"award-info":[{"award-number":["10.001.796"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3721146.3721940","type":"proceedings-article","created":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T17:42:05Z","timestamp":1743529325000},"page":"192-199","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Accelerating MoE Model Inference with Expert Sharding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6822-8891","authenticated-orcid":false,"given":"Oana","family":"Balmau","sequence":"first","affiliation":[{"name":"McGill University, Montreal, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8187-724X","authenticated-orcid":false,"given":"Anne-Marie","family":"Kermarrec","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7826-1599","authenticated-orcid":false,"given":"Rafael","family":"Pires","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0111-1863","authenticated-orcid":false,"given":"Andr\u00e9 Loureiro Esp\u00edrito","family":"Santo","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4157-4847","authenticated-orcid":false,"given":"Martijn","family":"de Vos","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5083-3045","authenticated-orcid":false,"given":"Milos","family":"Vujasinovic","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,4]]},"reference":[{"issue":"120","key":"e_1_3_2_1_1_1","article-title":"Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, 23(120), 2022.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_2_1","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung Won Chung Charles Sutton Sebastian Gehrmann Parker Schuh Kensen Shi Sasha Tsvyashchenko Joshua Maynez Abhishek Rao Parker Barnes Yi Tay Noam Shazeer Vinodkumar Prabhakaran Emily Reif Nan Du Ben Hutchinson Reiner Pope James Bradbury Jacob Austin Michael Isard Guy Gur-Ari Pengcheng Yin Toju Duke Anselm Levskaya Sanjay Ghemawat Sunipa Dev Henryk Michalewski Xavier Garcia Vedant Misra Kevin Robinson Liam Fedus Denny Zhou Daphne Ippolito David Luan Hyeontaek Lim Barret Zoph Alexander Spiridonov Ryan Sepassi David Dohan Shivani Agrawal Mark Omernick Andrew M. Dai Thanumalayan Sankaranarayana Pillai Marie Pellat Aitor Lewkowycz Erica Moreira Rewon Child Oleksandr Polozov Katherine Lee Zongwei Zhou Xuezhi Wang Brennan Saeta Mark Diaz Orhan Firat Michele Catasta Jason Wei Kathy Meier-Hellstern Douglas Eck Jeff Dean Slav Petrov and Noah Fiedel. Palm: Scaling language modeling with pathways 2022."},{"key":"e_1_3_2_1_3_1","volume-title":"On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258","author":"Bommasani Rishi","year":"2021","unstructured":"Rishi Bommasani, Drew A Hudson, Ehsan Adeli, Russ Altman, Simran Arora, Sydney von Arx, Michael S Bernstein, Jeannette Bohg, Antoine Bosselut, Emma Brunskill, et al. On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258, 2021."},{"key":"e_1_3_2_1_4_1","volume-title":"Risks and benefits of large language models for the environment. Environmental science and technology, 57, 02","author":"Rillig Matthias","year":"2023","unstructured":"Matthias Rillig, Marlene \u00c5gerstrand, Mohan Bi, Kenneth Gould, and Uli Sauerland. Risks and benefits of large language models for the environment. Environmental science and technology, 57, 02 2023."},{"key":"e_1_3_2_1_5_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer, 2017."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593704"},{"key":"e_1_3_2_1_7_1","volume-title":"Fastmoe: A fast mixture-of-expert training system","author":"He Jiaao","year":"2021","unstructured":"Jiaao He, Jiezhong Qiu, Aohan Zeng, Zhilin Yang, Jidong Zhai, and Jie Tang. Fastmoe: A fast mixture-of-expert training system, 2021."},{"key":"e_1_3_2_1_8_1","first-page":"120","volume-title":"Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP '22","author":"He Jiaao","year":"2022","unstructured":"Jiaao He, Jidong Zhai, Tiago Antunes, Haojie Wang, Fuwen Luo, Shangfeng Shi, and Qin Li. Fastermoe: modeling and optimizing training of large-scale dynamic pre-trained models. In Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP '22, page 120--134, New York, NY, USA, 2022. Association for Computing Machinery."},{"key":"e_1_3_2_1_9_1","first-page":"961","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Zhai Mingshu","year":"2023","unstructured":"Mingshu Zhai, Jiaao He, Zixuan Ma, Zan Zong, Runqing Zhang, and Jidong Zhai. SmartMoE: Efficiently training Sparsely-Activated models through combining offline and online parallelization. In 2023 USENIX Annual Technical Conference (USENIX ATC 23), pages 961--975, Boston, MA, July 2023. USENIX Association."},{"key":"e_1_3_2_1_10_1","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Zhou Yanqi","year":"2022","unstructured":"Yanqi Zhou, Tao Lei, Hanxiao Liu, Nan Du, Yanping Huang, Vincent Zhao, Andrew M Dai, zhifeng Chen, Quoc V Le, and James Laudon. Mixture-of-experts with expert choice routing. In Advances in Neural Information Processing Systems, volume 35. Curran Associates, Inc., 2022."},{"key":"e_1_3_2_1_11_1","volume-title":"M6-t: Exploring sparse expert models and beyond","author":"Yang An","year":"2021","unstructured":"An Yang, Junyang Lin, Rui Men, Chang Zhou, Le Jiang, Xianyan Jia, Ang Wang, Jie Zhang, Jiamang Wang, Yong Li, Di Zhang, Wei Lin, Lin Qu, Jingren Zhou, and Hongxia Yang. M6-t: Exploring sparse expert models and beyond, 2021."},{"key":"e_1_3_2_1_12_1","volume-title":"ICML","author":"Kim Yechan","year":"2024","unstructured":"Yechan Kim, Hwijoon Lim, and Dongsu Han. Scaling beyond the GPU memory limit for large mixture-of-experts model training. In ICML, 2024."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","first-page":"82","DOI":"10.1109\/CLUSTER52292.2023.00015","volume-title":"2023 IEEE International Conference on Cluster Computing (CLUSTER)","author":"Wang Wei","year":"2023","unstructured":"Wei Wang, Zhiquan Lai, Shengwei Li, Weijie Liu, Keshi Ge, Yujie Liu, Ao Shen, and Dongsheng Li. Prophet: Fine-grained load balancing for parallel training of large-scale moe models. In 2023 IEEE International Conference on Cluster Computing (CLUSTER), pages 82--94. IEEE, 2023."},{"key":"e_1_3_2_1_14_1","volume-title":"Lazarus: Resilient and elastic training of mixture-of-experts models with adaptive expert placement. arXiv preprint arXiv:2407.04656","author":"Wu Yongji","year":"2024","unstructured":"Yongji Wu, Wenjie Qu, Tianyang Tao, Zhuang Wang, Wei Bai, Zhuohao Li, Yuan Tian, Jiaheng Zhang, Matthew Lentz, and Danyang Zhuo. Lazarus: Resilient and elastic training of mixture-of-experts models with adaptive expert placement. arXiv preprint arXiv:2407.04656, 2024."},{"key":"e_1_3_2_1_15_1","volume-title":"NeurIPS","author":"A","year":"2017","unstructured":"A Vaswani et al. Attention is all you need. NeurIPS, 2017."},{"key":"e_1_3_2_1_16_1","volume-title":"ICLR","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. In ICLR, 2017."},{"key":"e_1_3_2_1_17_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. Gshard: Scaling giant models with conditional computation and automatic sharding, 2020."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_2_1_19_1","volume-title":"Megablocks: Efficient sparse training with mixture-of-experts","author":"Gale Trevor","year":"2022","unstructured":"Trevor Gale, Deepak Narayanan, Cliff Young, and Matei Zaharia. Megablocks: Efficient sparse training with mixture-of-experts, 2022."},{"key":"e_1_3_2_1_20_1","volume-title":"Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, 21(140)","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, 21(140), 2020."},{"key":"e_1_3_2_1_21_1","volume-title":"Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In arXiv:1506.06724","author":"Zhu Yukun","year":"2015","unstructured":"Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In arXiv:1506.06724, 2015."},{"key":"e_1_3_2_1_22_1","series-title":"Proceedings of Machine Learning Research","first-page":"18332","volume-title":"Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. DeepSpeed-MoE: Advancing mixture-of-experts inference and training to power next-generation AI scale. In Kamalika Chaudhuri","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. DeepSpeed-MoE: Advancing mixture-of-experts inference and training to power next-generation AI scale. In Kamalika Chaudhuri, Stefanie Jegelka, Le Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato, editors, Proceedings of the 39th International Conference on Machine Learning, volume 162 of Proceedings of Machine Learning Research, pages 18332--18346. PMLR, 17--23 Jul 2022."},{"key":"e_1_3_2_1_23_1","volume-title":"Tutel: Adaptive mixture-of-experts at scale","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, Wei Cui, Yifan Xiong, Ziyue Yang, Ze Liu, Han Hu, Zilong Wang, Rafael Salas, Jithin Jose, Prabhat Ram, Joe Chau, Peng Cheng, Fan Yang, Mao Yang, and Yongqiang Xiong. Tutel: Adaptive mixture-of-experts at scale, 2023."},{"key":"e_1_3_2_1_24_1","first-page":"945","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, Cong Wang, and Hong Xu. Accelerating distributed MoE training and inference with lina. In 2023 USENIX Annual Technical Conference (USENIX ATC 23), pages 945--959, Boston, MA, July 2023. USENIX Association."},{"key":"e_1_3_2_1_25_1","volume-title":"Exploiting inter-layer expert affinity for accelerating mixture-of-experts model inference","author":"Yao Jinghan","year":"2024","unstructured":"Jinghan Yao, Quentin Anthony, Aamir Shafi, Hari Subramoni, Dhabaleswar K., and Panda. Exploiting inter-layer expert affinity for accelerating mixture-of-experts model inference, 2024."}],"event":{"name":"EuroMLSys '25: 5th Workshop on Machine Learning and Systems","location":"World Trade Center Rotterdam Netherlands","acronym":"EuroMLSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 5th Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721146.3721940","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721146.3721940","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:57:39Z","timestamp":1750298259000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721146.3721940"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":25,"alternative-id":["10.1145\/3721146.3721940","10.1145\/3721146"],"URL":"https:\/\/doi.org\/10.1145\/3721146.3721940","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-04-01","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}