{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T11:05:56Z","timestamp":1777460756292,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","funder":[{"name":"Swiss National Science Foundation","award":["10.001.796"],"award-info":[{"award-number":["10.001.796"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3805621.3807659","type":"proceedings-article","created":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:08:45Z","timestamp":1777381725000},"page":"466-472","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["The Cost of Expertise: Understanding MoE Decode Performance"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6207-5905","authenticated-orcid":false,"given":"Sami","family":"Abuzakuk","sequence":"first","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6822-8891","authenticated-orcid":false,"given":"Oana","family":"Balmau","sequence":"additional","affiliation":[{"name":"McGill University, Montreal, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2867-7814","authenticated-orcid":false,"given":"Jiaxuan","family":"Chen","sequence":"additional","affiliation":[{"name":"McGill University, Montreal, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8187-724X","authenticated-orcid":false,"given":"Anne-Marie","family":"Kermarrec","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7826-1599","authenticated-orcid":false,"given":"Rafael","family":"Pires","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7266-5522","authenticated-orcid":false,"given":"Ramya","family":"Prabhu","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4157-4847","authenticated-orcid":false,"given":"Martijn","family":"de Vos","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Anthropic. Claude (claude 3 opus version) 2024. Large language model."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.5555\/944919.944966"},{"key":"e_1_3_2_1_3_1","first-page":"1901","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, et al. Language models are few-shot learners. In Advances in Neural Information Processing Systems (NeurIPS), volume 33, pages 1877\u20131901, 2020."},{"key":"e_1_3_2_1_4_1","volume-title":"Deepseek-v3 technical report","author":"AI.","year":"2025","unstructured":"DeepSeek-AI. Deepseek-v3 technical report, 2025."},{"key":"e_1_3_2_1_5_1","volume-title":"Harmoeny: Efficient multi-gpu inference of moe models","author":"Doucet Zachary","year":"2025","unstructured":"Zachary Doucet, Rishi Sharma, Martijn de Vos, Rafael Pires, Anne-Marie Kermarrec, and Oana Balmau. Harmoeny: Efficient multi-gpu inference of moe models, 2025."},{"issue":"120","key":"e_1_3_2_1_6_1","first-page":"1","article-title":"Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, 23(120):1\u201339, 2022.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_7_1","volume-title":"Megablocks: Efficient sparse training with mixture-of-experts","author":"Gale Trevor","year":"2022","unstructured":"Trevor Gale, Deepak Narayanan, Cliff Young, and Matei Zaharia. Megablocks: Efficient sparse training with mixture-of-experts, 2022."},{"key":"e_1_3_2_1_8_1","volume-title":"Gemini: a family of highly capable multimodal models","author":"Gemini Team","year":"2023","unstructured":"Gemini Team et al. Gemini: a family of highly capable multimodal models, 2023."},{"key":"e_1_3_2_1_9_1","volume-title":"Moetuner: Optimizing mixture-of-experts inference via expert placement and routing-aware scheduling","author":"Go","year":"2025","unstructured":"Go et al. Moetuner: Optimizing mixture-of-experts inference via expert placement and routing-aware scheduling, 2025. arXiv preprint."},{"key":"e_1_3_2_1_10_1","volume-title":"September","author":"Gordic Aleksa","year":"2025","unstructured":"Aleksa Gordic. Inside vllm: Anatomy of a high-throughput llm inference system, September 2025. Accessed: 2026-02-01."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of Recent Advances in Natural Language Processing (RANLP)","author":"Gupta Rohit","year":"2019","unstructured":"Rohit Gupta, Constantin Or\u0103san, and Ruslan Mitkov. Character-based neural machine translation with transformers. In Proceedings of Recent Advances in Natural Language Processing (RANLP), 2019."},{"key":"e_1_3_2_1_12_1","volume-title":"Inference without interference: Disaggregate llm inference for mixed downstream workloads","author":"Hu Qizheng","year":"2024","unstructured":"Qizheng Hu, Ziyun Huang, et al. Inference without interference: Disaggregate llm inference for mixed downstream workloads, 2024."},{"key":"e_1_3_2_1_13_1","volume-title":"Towards efficient mixture-of-experts deployment: Dynamic gating and expert load balancing","author":"Huang","year":"2023","unstructured":"Huang et al. Towards efficient mixture-of-experts deployment: Dynamic gating and expert load balancing, 2023. arXiv preprint."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of Machine Learning and Systems (MLSys)","author":"Hwang Sehoon","year":"2023","unstructured":"Sehoon Hwang, Deepak Narayanan, Jongsoo Kim, et al. Tutel: Adaptive mixture-of-experts at scale. In Proceedings of Machine Learning and Systems (MLSys), 2023."},{"key":"e_1_3_2_1_15_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Imai Saki","year":"2024","unstructured":"Saki Imai, Rina Nakazawa, Marcelo Amaral, Sunyanan Choochotkaew, and Tatsuhiro Chiba. Predicting llm inference latency: A roofline-driven ml method. In Advances in Neural Information Processing Systems (NeurIPS), 2024."},{"key":"e_1_3_2_1_16_1","volume-title":"Mixtral of experts","author":"Jiang Albert Q.","year":"2024","unstructured":"Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, et al. Mixtral of experts, 2024."},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Kudo Taku","year":"2018","unstructured":"Taku Kudo. Subword regularization: Improving neural network translation models with multiple subword candidates. In Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL), 2018."},{"key":"e_1_3_2_1_18_1","volume-title":"Accelerating distributed moe training and inference with lina","author":"Li Jiamin","year":"2024","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, Cong Wang, and Hong Xu. Accelerating distributed moe training and inference with lina, 2024."},{"key":"e_1_3_2_1_19_1","volume-title":"Mastering llm techniques: Inference optimization. https:\/\/developer.nvidia.com\/blog\/mastering-llm-techniques-inference-optimization\/","author":"NVIDIA.","year":"2023","unstructured":"NVIDIA. Mastering llm techniques: Inference optimization. https:\/\/developer.nvidia.com\/blog\/mastering-llm-techniques-inference-optimization\/, 2023. Accessed: 2026-02-24."},{"key":"e_1_3_2_1_20_1","volume-title":"Gpt-4 technical report","author":"AI.","year":"2024","unstructured":"OpenAI. Gpt-4 technical report, 2024."},{"key":"e_1_3_2_1_21_1","volume-title":"gpt-oss-120b & gpt-oss-20b model card","author":"AI.","year":"2025","unstructured":"OpenAI. gpt-oss-120b & gpt-oss-20b model card, 2025."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_23_1","volume-title":"Forecasting llm inference performance via hardware-agnostic analytical modeling","author":"Patwari Rajeev","year":"2025","unstructured":"Rajeev Patwari, Ashish Sirasao, and Devleena Das. Forecasting llm inference performance via hardware-agnostic analytical modeling, 2025."},{"key":"e_1_3_2_1_24_1","volume-title":"February","author":"Team Qwen","year":"2024","unstructured":"Qwen Team. Introducing qwen1.5, February 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"Language models are unsupervised multitask learners","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, et al. Language models are unsupervised multitask learners, 2019. OpenAI blog."},{"key":"e_1_3_2_1_26_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Jeffrey Li, Zhewei Yao, et al. Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale. In International Conference on Machine Learning (ICML), 2022."},{"key":"e_1_3_2_1_27_1","volume-title":"Openai gpt-5 system card","author":"Singh Aaditya","year":"2025","unstructured":"Aaditya Singh, Adam Fry, Adam Perelman, Adam Tart, et al. Openai gpt-5 system card, 2025."},{"key":"e_1_3_2_1_28_1","volume-title":"Llm performance: Prefill, decode, and concurrent requests. https:\/\/huggingface.co\/blog\/tngtech\/llm-performance-prefill-decode-concurrent-requests","author":"Technology Consulting TNG","year":"2024","unstructured":"TNG Technology Consulting and Hugging Face. Llm performance: Prefill, decode, and concurrent requests. https:\/\/huggingface.co\/blog\/tngtech\/llm-performance-prefill-decode-concurrent-requests, 2024. Accessed: 2026-02-24."},{"key":"e_1_3_2_1_29_1","first-page":"6008","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need. In Advances in Neural Information Processing Systems (NeurIPS), volume 30, pages 5998\u20136008, 2017."},{"key":"e_1_3_2_1_30_1","volume-title":"Qwen3 technical report","author":"Yang An","year":"2025","unstructured":"An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, et al. Qwen3 technical report, 2025."},{"key":"e_1_3_2_1_31_1","volume-title":"Llm inference unveiled: Survey and roofline model insights","author":"Yuan Zhihang","year":"2024","unstructured":"Zhihang Yuan, Yuzhang Shang, Yang Zhou, Zhen Dong, et al. Llm inference unveiled: Survey and roofline model insights, 2024."},{"key":"e_1_3_2_1_32_1","first-page":"1096","volume-title":"2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)","author":"Zhang Hengrui","unstructured":"Hengrui Zhang, August Ning, Rohan Baskar Prabhakar, and David Wentzlaff. Llmcompass: Enabling efficient hardware design for large language model inference. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA), pages 1080\u20131096. IEEE Computer Society, 2024."},{"key":"e_1_3_2_1_33_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Zhang, Siming Zhao, et al. Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI), 2024."}],"event":{"name":"EuroSys '26: 21st European Conference on Computer Systems","location":"Edinburgh Scotland Uk","acronym":"EuroMLSys '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Sixth European Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3805621.3807659","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:13:04Z","timestamp":1777381984000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805621.3807659"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,27]]},"references-count":33,"alternative-id":["10.1145\/3805621.3807659","10.1145\/3805621"],"URL":"https:\/\/doi.org\/10.1145\/3805621.3807659","relation":{},"subject":[],"published":{"date-parts":[[2026,4,27]]},"assertion":[{"value":"2026-04-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}