{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T02:21:04Z","timestamp":1768011664158,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767706","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:20:02Z","timestamp":1762532402000},"page":"1502-1511","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MoE-Inference-Bench: Performance Evaluation of Mixture of Expert Large Language and Vision Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3027-1915","authenticated-orcid":false,"given":"Krishna Teja","family":"Chitty-Venkata","sequence":"first","affiliation":[{"name":"Argonne National Laboratory (ANL), Lemont, IL, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7846-751X","authenticated-orcid":false,"given":"Sylvia","family":"Howland","sequence":"additional","affiliation":[{"name":"Cerebras, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2016-2822","authenticated-orcid":false,"given":"Golara","family":"Azar","sequence":"additional","affiliation":[{"name":"Cerebras, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2654-3767","authenticated-orcid":false,"given":"Daria","family":"Soboleva","sequence":"additional","affiliation":[{"name":"Cerebras, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1038-1572","authenticated-orcid":false,"given":"Natalia","family":"Vassilieva","sequence":"additional","affiliation":[{"name":"Cerebras, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4832-0834","authenticated-orcid":false,"given":"Siddhisanket","family":"Raskar","sequence":"additional","affiliation":[{"name":"Pacific Northwest National Laboratory (PNNL), Providence, RI, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6279-0007","authenticated-orcid":false,"given":"Murali","family":"Emani","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory (ANL), Lemont, IL, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7248-6116","authenticated-orcid":false,"given":"Venkatram","family":"Vishwanath","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory (ANL), Lemont, IL, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Marah Abdin Jyoti Aneja Hany Awadalla Ahmed Awadallah Ammar\u00a0Ahmad Awan Nguyen Bach Amit Bahree Arash Bakhtiari Jianmin Bao Harkirat Behl et\u00a0al. 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.14219 (2024)."},{"key":"e_1_3_3_1_3_2","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et\u00a0al. 2023. Qwen technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.16609 (2023)."},{"key":"e_1_3_3_1_4_2","unstructured":"Weilin Cai Juyong Jiang Fan Wang Jing Tang Sunghun Kim and Jiayi Huang. 2024. A Survey on Mixture of Experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.06204 (2024)."},{"key":"e_1_3_3_1_5_2","unstructured":"Cerebras. 2024. CS-3. 
https:\/\/www.cerebras.ai\/system"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00178"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette. 2023. Nvidia hopper h100 gpu: Scaling performance. IEEE Micro 43 3 (2023) 9\u201317.","DOI":"10.1109\/MM.2023.3256796"},{"key":"e_1_3_3_1_8_2","unstructured":"Christopher Clark Kenton Lee Ming-Wei Chang Tom Kwiatkowski Michael Collins and Kristina Toutanova. 2019. Boolq: Exploring the surprising difficulty of natural yes\/no questions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1905.10044 (2019)."},{"key":"e_1_3_3_1_9_2","unstructured":"Peter Clark Isaac Cowhey Oren Etzioni Tushar Khot Ashish Sabharwal Carissa Schoenick and Oyvind Tafjord. 2018. Think you have Solved Question Answering? Try ARC the AI2 Reasoning Challenge. ArXiv abs\/1803.05457 (2018)."},{"key":"e_1_3_3_1_10_2","unstructured":"Damai Dai Chengqi Deng Chenggang Zhao RX Xu Huazuo Gao Deli Chen Jiashi Li Wangding Zeng Xingkai Yu Y Wu et\u00a0al. 2024. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.06066 (2024)."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3685520"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW63119.2024.00016"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS56514.2022.00007"},{"key":"e_1_3_3_1_14_2","unstructured":"William Fedus Barret Zoph and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research 23 120 (2022) 1\u201339."},{"key":"e_1_3_3_1_15_2","unstructured":"Elias Frantar Saleh Ashkboos Torsten Hoefler and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.17323 (2022)."},{"key":"e_1_3_3_1_16_2","unstructured":"Chaoyou Fu Peixian Chen Yunhang Shen Yulei Qin Mengdan Zhang Xu Lin Jinrui Yang Xiawu Zheng Ke Li Xing Sun et\u00a0al. 2023. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.13394 (2023)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","unstructured":"Leo Gao Jonathan Tow Baber Abbasi Stella Biderman Sid Black Anthony DiPofi Charles Foster Laurence Golding Jeffrey Hsu Alain Le\u00a0Noac\u2019h Haonan Li Kyle McDonell Niklas Muennighoff Chris Ociepa Jason Phang Laria Reynolds Hailey Schoelkopf Aviya Skowron Lintang Sutawika Eric Tang Anish Thite Ben Wang Kevin Wang and Andy Zou. 2024. The Language Model Evaluation Harness. 10.5281\/zenodo.12608602","DOI":"10.5281\/zenodo.12608602"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1201\/9781003162810-13"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508418"},{"key":"e_1_3_3_1_20_2","unstructured":"Dan Hendrycks Collin Burns Steven Basart Andy Zou Mantas Mazeika Dawn Song and Jacob Steinhardt. 2021. Measuring Massive Multitask Language Understanding. Proceedings of the International Conference on Learning Representations (ICLR) (2021)."},{"key":"e_1_3_3_1_21_2","unstructured":"Albert\u00a0Q Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra\u00a0Singh Chaplot Diego de\u00a0las Casas Emma\u00a0Bou Hanna Florian Bressand et\u00a0al. 2024. Mixtral of experts. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.04088 (2024)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"e_1_3_3_1_23_2","unstructured":"Andrey Kuzmin Mart Van\u00a0Baalen Yuwei Ren Markus Nagel Jorn Peters and Tijmen Blankevoort. 2022. Fp8 quantization: The power of the exponent. Advances in Neural Information Processing Systems 35 (2022) 14651\u201314662."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_1_25_2","unstructured":"Dmitry Lepikhin HyoukJoong Lee Yuanzhong Xu Dehao Chen Orhan Firat Yanping Huang Maxim Krikun Noam Shazeer and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.16668 (2020)."},{"key":"e_1_3_3_1_26_2","unstructured":"Ji Lin Jiaming Tang Haotian Tang Shang Yang Wei-Ming Chen Wei-Chen Wang Guangxuan Xiao Xingyu Dang Chuang Gan and Song Han. 2024. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arxiv:https:\/\/arXiv.org\/abs\/2306.00978\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2306.00978"},{"key":"e_1_3_3_1_27_2","unstructured":"Aixin Liu Bei Feng Bin Wang Bingxuan Wang Bo Liu Chenggang Zhao Chengqi Dengr Chong Ruan Damai Dai Daya Guo et\u00a0al. 2024. Deepseek-v2: A strong economical and efficient mixture-of-experts language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.04434 (2024)."},{"key":"e_1_3_3_1_28_2","unstructured":"Jiacheng Liu Peng Tang Wenfeng Wang Yuhang Ren Xiaofeng Hou Pheng-Ann Heng Minyi Guo and Chao Li. 2024. A survey on inference optimization techniques for mixture of experts models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.14219 (2024)."},{"key":"e_1_3_3_1_29_2","unstructured":"Pan Lu Swaroop Mishra Tanglin Xia Liang Qiu Kai-Wei Chang Song-Chun Zhu Oyvind Tafjord Peter Clark and Ashwin Kalyan. 2022. Learn to explain: Multimodal reasoning via thought chains for science question answering. Advances in Neural Information Processing Systems 35 (2022) 2507\u20132521."},{"key":"e_1_3_3_1_30_2","unstructured":"Xudong Lu Qi Liu Yuhui Xu Aojun Zhou Siyuan Huang Bo Zhang Junchi Yan and Hongsheng Li. 2024. Not all experts are equal: Efficient expert pruning and skipping for mixture-of-experts large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.14800 (2024)."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"e_1_3_3_1_33_2","unstructured":"AI Meta. 2025. The llama 4 herd: The beginning of a new era of natively multimodal ai innovation. https:\/\/ai. meta. com\/blog\/llama-4-multimodal-intelligence\/ checked on 4 7 (2025) 2025."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1260"},{"key":"e_1_3_3_1_35_2","unstructured":"mistralai. 2023. Mistral-7B-v0.1. https:\/\/huggingface.co\/mistralai\/Mistral-7B-v0.1"},{"key":"e_1_3_3_1_36_2","unstructured":"Niklas Muennighoff Luca Soldaini Dirk Groeneveld Kyle Lo Jacob Morrison Sewon Min Weijia Shi Pete Walsh Oyvind Tafjord Nathan Lambert et\u00a0al. 2024. Olmoe: Open mixture-of-experts language models. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.02060 (2024)."},{"key":"e_1_3_3_1_37_2","first-page":"18332","volume-title":"International conference on machine learning","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza\u00a0Yazdani Aminabadi, Ammar\u00a0Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale. In International conference on machine learning. PMLR, 18332\u201318346."},{"key":"e_1_3_3_1_38_2","unstructured":"Keisuke Sakaguchi Ronan\u00a0Le Bras Chandra Bhagavatula and Yejin Choi. 2019. WinoGrande: An Adversarial Winograd Schema Challenge at Scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.10641 (2019)."},{"key":"e_1_3_3_1_39_2","unstructured":"Noam Shazeer Azalia Mirhoseini Krzysztof Maziarz Andy Davis Quoc Le Geoffrey Hinton and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1701.06538 (2017)."},{"key":"e_1_3_3_1_40_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.08053 (2019)."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593704"},{"key":"e_1_3_3_1_43_2","unstructured":"Kimi Team Yifan Bai Yiping Bao Guanduo Chen Jiahao Chen Ningxin Chen Ruijue Chen Yanru Chen Yuankun Chen Yutian Chen et\u00a0al. 2025. Kimi K2: Open Agentic Intelligence. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.20534 (2025)."},{"key":"e_1_3_3_1_44_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_45_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"e_1_3_3_1_47_2","unstructured":"Zhiyu Wu Xiaokang Chen Zizheng Pan Xingchao Liu Wen Liu Damai Dai Huazuo Gao Yiyang Ma Chengyue Wu Bingxuan Wang et\u00a0al. 2024. Deepseek-vl2: Mixture-of-experts vision-language models for advanced multimodal understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.10302 (2024)."},{"key":"e_1_3_3_1_48_2","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et\u00a0al. 2025. Qwen3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.09388 (2025)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Cheng Yang Yang Sui Jinqi Xiao Lingyi Huang Yu Gong Yuanlin Duan Wenqi Jia Miao Yin Yu Cheng and Bo Yuan. 2024. MoE-I2: Compressing Mixture of Experts Models through Inter-Expert Pruning and Intra-Expert Low-Rank Decomposition. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.01016 (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.612"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","unstructured":"Junqi Yin Sajal Dash John Gounley Feiyi Wang and Georgia Tourassi. 2023. Evaluation of pre-training large language models on leadership-class supercomputers. The Journal of Supercomputing (06 2023) 1\u201322. 10.1007\/s11227-023-05479-7","DOI":"10.1007\/s11227-023-05479-7"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","unstructured":"Junqi Yin Aristeidis Tsaris Sajal Dash Ross Miller Feiyi Wang and Mallikarjun\u00a0(Arjun) Shankar. 2021. Comparative evaluation of deep learning workloads for leadership-class systems. BenchCouncil Transactions on Benchmarks Standards and Evaluations 1 1 (2021) 100005. 10.1016\/j.tbench.2021.100005","DOI":"10.1016\/j.tbench.2021.100005"},{"key":"e_1_3_3_1_52_2","unstructured":"Shukang Yin Chaoyou Fu Sirui Zhao Ke Li Xing Sun Tong Xu and Enhong Chen. 2023. A survey on multimodal large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.13549 (2023)."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"e_1_3_3_1_55_2","unstructured":"Yi-Fan Zhang Huanyu Zhang Haochen Tian Chaoyou Fu Shuangqing Zhang Junfei Wu Feng Li Kun Wang Qingsong Wen Zhang Zhang et\u00a0al. 2024. MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.13257 (2024)."}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767706","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:35:37Z","timestamp":1767987337000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767706"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":54,"alternative-id":["10.1145\/3731599.3767706","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767706","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
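The block below is a minimal, illustrative sketch, not part of the Crossref record itself. It shows how a work record with the structure above can be retrieved from the public Crossref REST API and how its main fields (title, container title, authors, reference count) can be read. The DOI and the field names ("message", "title", "author", "affiliation", "references-count") are taken from the record above; the choice of urllib for the request and the particular fields printed are assumptions made purely for illustration.

```python
# Sketch: fetch and read a Crossref work record like the one above.
# Endpoint and response layout follow api.crossref.org ("message" wraps the work);
# which fields to print is an illustrative choice, not prescribed by the record.
import json
import urllib.request

DOI = "10.1145/3731599.3767706"  # DOI as given in the record above
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

work = record["message"]                     # the work object shown above
print(work["title"][0])                      # article title (list of strings)
print(work["container-title"][0])            # proceedings title
for author in work.get("author", []):
    names = f'{author.get("given", "")} {author.get("family", "")}'.strip()
    affils = "; ".join(a["name"] for a in author.get("affiliation", []))
    print(f"{names} - {affils}")
print("references:", work["references-count"])
```

Run as-is, this would print the title, proceedings name, author/affiliation pairs, and the reference count (54) recorded in the metadata above.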