{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:53:15Z","timestamp":1773193995769,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"name":"National Key Research & Development Program of China","award":["2022YFB4502004"],"award-info":[{"award-number":["2022YFB4502004"]}]},{"name":"Natural Science Foundation of China","award":["92467102"],"award-info":[{"award-number":["92467102"]}]},{"name":"Tsinghua University Initiative Scientific Research Program, Young Elite Scientists Sponsorship Program by CAST","award":["2022QNRC001"],"award-info":[{"award-number":["2022QNRC001"]}]},{"name":"Beijing Natural Science Foundation","award":["L252014"],"award-info":[{"award-number":["L252014"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764843","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"1014-1029","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["KTransformers: Unleashing the Full Potential of CPU\/GPU Hybrid Inference for MoE Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5110-0413","authenticated-orcid":false,"given":"Hongtao","family":"Chen","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0173-1027","authenticated-orcid":false,"given":"Weiyu","family":"Xie","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2857-3713","authenticated-orcid":false,"given":"Boxin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3300-9425","authenticated-orcid":false,"given":"Jingqi","family":"Tang","sequence":"additional","affiliation":[{"name":"Approaching.AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8030-5976","authenticated-orcid":false,"given":"Jiahao","family":"Wang","sequence":"additional","affiliation":[{"name":"Approaching.Al, Beijing, China"},{"name":"Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3175-1607","authenticated-orcid":false,"given":"Jianwei","family":"Dong","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3526-3241","authenticated-orcid":false,"given":"Shaoyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5797-2108","authenticated-orcid":false,"given":"Ziwei","family":"Yuan","sequence":"additional","affiliation":[{"name":"Approaching.AI, Beijing, China"},{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8554-0411","authenticated-orcid":false,"given":"Chen","family":"Lin","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4422-9368","authenticated-orcid":false,"given":"Chengyu","family":"Qiu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6716-5709","authenticated-orcid":false,"given":"Yuening","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6762-8056","authenticated-orcid":false,"given":"Qingliang","family":"Ou","sequence":"additional","affiliation":[{"name":"Approaching.AI, Beijing, China"},{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8097-0846","authenticated-orcid":false,"given":"Jiaqi","family":"Liao","sequence":"additional","affiliation":[{"name":"Approaching.AI, Beijing, China"},{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4975-4856","authenticated-orcid":false,"given":"Xianglin","family":"Chen","sequence":"additional","affiliation":[{"name":"Approaching.AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8050-6288","authenticated-orcid":false,"given":"Zhiyuan","family":"Ai","sequence":"additional","affiliation":[{"name":"Approaching.AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6651-7032","authenticated-orcid":false,"given":"Yongwei","family":"Wu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7518-0753","authenticated-orcid":false,"given":"Mingxing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Aminabadi Reza Yazdani","year":"2022","unstructured":"Reza Yazdani Aminabadi, Samyam Rajbhandari, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Olatunji Ruwase, Shaden Smith, Minjia Zhang, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-inference: enabling efficient inference of transformer models at unprecedented scale. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Dallas, Texas) (SC '22). IEEE Press, Article 46, 15 pages."},{"key":"e_1_3_2_1_2_1","unstructured":"Jacob Austin Augustus Odena Maxwell Nye Maarten Bosma Henryk Michalewski David Dohan Ellen Jiang Carrie Cai Michael Terry Quoc Le et al. 2021. Program Synthesis with Large Language Models. arXiv preprint arXiv:2108.07732 (2021)."},{"key":"e_1_3_2_1_3_1","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique Ponde de Oliveira Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman Alex Ray Raul Puri Gretchen Krueger Michael Petrov Heidy Khlaaf Girish Sastry Pamela Mishkin Brooke Chan Scott Gray Nick Ryder Mikhail Pavlov Alethea Power Lukasz Kaiser Mohammad Bavarian Clemens Winter Philippe Tillet Felipe Petroski Such Dave Cummings Matthias Plappert Fotios Chantzis Elizabeth Barnes Ariel Herbert-Voss William Hebgen Guss Alex Nichol Alex Paino Nikolas Tezak Jie Tang Igor Babuschkin Suchir Balaji Shantanu Jain William Saunders Christopher Hesse Andrew N. Carr Jan Leike Josh Achiam Vedant Misra Evan Morikawa Alec Radford Matthew Knight Miles Brundage Mira Murati Katie Mayer Peter Welinder Bob McGrew Dario Amodei Sam McCandlish Ilya Sutskever and Wojciech Zaremba. 2021. Evaluating Large Language Models Trained on Code. 
arXiv:2107.03374 [cs.LG]"},{"key":"e_1_3_2_1_4_1","volume-title":"Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168","author":"Cobbe Karl","year":"2021","unstructured":"Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, Christopher Hesse, and John Schulman. 2021. Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168 (2021)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","first-page":"381","DOI":"10.1145\/2499368.2451157","article-title":"Traffic management: a holistic approach to memory placement on NUMA systems","volume":"48","author":"Dashti Mohammad","year":"2013","unstructured":"Mohammad Dashti, Alexandra Fedorova, Justin Funston, Fabien Gaud, Renaud Lachaize, Baptiste Lepers, Vivien Quema, and Mark Roth. 2013. Traffic management: a holistic approach to memory placement on NUMA systems. ACM SIGPLAN Notices 48, 4 (2013), 381\u2013394.","journal-title":"ACM SIGPLAN Notices"},{"key":"e_1_3_2_1_6_1","unstructured":"DeepSeek-AI. 2024. DeepSeek-V2: A Strong Economical and Efficient Mixture-of-Experts Language Model. arXiv:2405.04434 [cs.CL]"},{"key":"e_1_3_2_1_7_1","unstructured":"DeepSeek-AI. 2024. DeepSeek-V3 Technical Report. arXiv:2412.19437 [cs.CL] https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_2_1_8_1","unstructured":"DeepSeek-AI. 2025. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948 [cs.CL] https:\/\/arxiv.org\/abs\/2501.12948"},{"key":"e_1_3_2_1_9_1","volume-title":"int8 (): 8-bit matrix multiplication for transformers at scale. Advances in neural information processing systems 35","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale. Advances in neural information processing systems 35 (2022), 30318\u201330332."},{"key":"e_1_3_2_1_10_1","volume-title":"Fast inference of mixture-of-experts language models with offloading. arXiv preprint arXiv:2312.17238","author":"Eliseev Artyom","year":"2023","unstructured":"Artyom Eliseev and Denis Mazur. 2023. Fast inference of mixture-of-experts language models with offloading. arXiv preprint arXiv:2312.17238 (2023)."},{"key":"e_1_3_2_1_11_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research 23, 120 (2022), 1\u201339.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_12_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_13_1","volume-title":"MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models. arXiv preprint arXiv:2408.11743","author":"Frantar Elias","year":"2024","unstructured":"Elias Frantar, Roberto L Castro, Jiale Chen, Torsten Hoefler, and Dan Alistarh. 2024. 
MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models. arXiv preprint arXiv:2408.11743 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"ggerganov\/llama.cpp. Retrieved","author":"Georgi Gerganov","year":"2025","unstructured":"Georgi Gerganov 2023. ggerganov\/llama.cpp. Retrieved Feb 8, 2025 from https:\/\/github.com\/ggerganov\/llama.cpp"},{"key":"e_1_3_2_1_15_1","volume-title":"Did Aristotle Use a Laptop? A Question Answering Benchmark with Implicit Reasoning Strategies. Transactions of the Association for Computational Linguistics (TACL)","author":"Geva Mor","year":"2021","unstructured":"Mor Geva, Daniel Khashabi, Elad Segal, Tushar Khot, Dan Roth, and Jonathan Berant. 2021. Did Aristotle Use a Laptop? A Question Answering Benchmark with Implicit Reasoning Strategies. Transactions of the Association for Computational Linguistics (TACL) (2021)."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 770\u2013778","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition. 770\u2013778."},{"key":"e_1_3_2_1_17_1","volume-title":"Expertflow: Optimized expert activation and token allocation for efficient mixture-of-experts inference. arXiv preprint arXiv:2410.17954","author":"He Xin","year":"2024","unstructured":"Xin He, Shunkang Zhang, Yuxin Wang, Haiyan Yin, Zihao Zeng, Shaohuai Shi, Zhenheng Tang, Xiaowen Chu, Ivor Tsang, and Ong Yew Soon. 2024. Expertflow: Optimized expert activation and token allocation for efficient mixture-of-experts inference. arXiv preprint arXiv:2410.17954 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Mc-moe: Mixture compressor for mixture-of-experts llms gains more. arXiv preprint arXiv:2410.06270","author":"Huang Wei","year":"2024","unstructured":"Wei Huang, Yue Liao, Jianhui Liu, Ruifei He, Haoru Tan, Shiming Zhang, Hongsheng Li, Si Liu, and Xiaojuan Qi. 2024. Mc-moe: Mixture compressor for mixture-of-experts llms gains more. arXiv preprint arXiv:2410.06270 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Retrieved","year":"2022","unstructured":"HuggingFace 2022. huggingface\/text-generation-inference. Retrieved Apr 17, 2025 from https:\/\/github.com\/huggingface\/text-generation-inference"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"e_1_3_2_1_21_1","volume-title":"Mixture of experts with mixture of precisions for tuning quality of service. arXiv preprint arXiv:2407.14417","author":"Imani HamidReza","year":"2024","unstructured":"HamidReza Imani, Abdolah Amirany, and Tarek El-Ghazawi. 2024. Mixture of experts with mixture of precisions for tuning quality of service. arXiv preprint arXiv:2407.14417 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Intel Memory Latency Checker. Retrieved","author":"Intel","year":"2025","unstructured":"Intel 2013. Intel Memory Latency Checker. Retrieved Apr 5, 2025 from https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/tool\/intelr-memory-latency-checker.html"},{"key":"e_1_3_2_1_23_1","unstructured":"Albert Q. 
Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra Singh Chaplot Diego de las Casas Emma Bou Hanna Florian Bressand Gianna Lengyel Guillaume Bour Guillaume Lample L\u00e9lio Renard Lavaud Lucile Saulnier MarieAnne Lachaux Pierre Stock Sandeep Subramanian Sophia Yang Szymon Antoniak Teven Le Scao Th\u00e9ophile Gervet Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2024. Mixtral of Experts. arXiv:2401.04088 [cs.LG] https:\/\/arxiv.org\/abs\/2401.04088"},{"key":"e_1_3_2_1_24_1","volume-title":"Fiddler: CPU-GPU Orchestration for Fast Inference of Mixture-of-Experts Models. arXiv:2402.07033 [cs.LG]","author":"Kamahori Keisuke","year":"2024","unstructured":"Keisuke Kamahori, Yile Gu, Kan Zhu, and Baris Kasikci. 2024. Fiddler: CPU-GPU Orchestration for Fast Inference of Mixture-of-Experts Models. arXiv:2402.07033 [cs.LG]"},{"key":"e_1_3_2_1_25_1","volume-title":"Mixture of quantized experts (moqe): Complementary effect of low-bit quantization and robustness. arXiv preprint arXiv:2310.02410","author":"Kim Young Jin","year":"2023","unstructured":"Young Jin Kim, Raffy Fahim, and Hany Hassan Awadalla. 2023. Mixture of quantized experts (moqe): Complementary effect of low-bit quantization and robustness. arXiv preprint arXiv:2310.02410 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles. 611\u2013626","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient memory management for large language model serving with pagedattention. In Proceedings of the 29th Symposium on Operating Systems Principles. 611\u2013626."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 6159\u20136172","author":"Lu Xudong","year":"2024","unstructured":"Xudong Lu, Qi Liu, Yuhui Xu, Aojun Zhou, Siyuan Huang, Bo Zhang, Junchi Yan, and Hongsheng Li. 2024. Not All Experts are Equal: Efficient Expert Pruning and Skipping for Mixture-of-Experts Large Language Models. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 6159\u20136172."},{"key":"e_1_3_2_1_28_1","unstructured":"Stephen Merity Caiming Xiong James Bradbury and Richard Socher. 2016. Pointer Sentinel Mixture Models. arXiv:1609.07843 [cs.CL]"},{"key":"e_1_3_2_1_29_1","volume-title":"Retrieved","author":"Meta","year":"2025","unstructured":"Meta 2025. Llama 4. Retrieved Apr 17, 2025 from https:\/\/www.llama.com\/models\/llama-4\/"},{"key":"e_1_3_2_1_30_1","volume-title":"Retrieved","author":"NVIDIA","year":"2018","unstructured":"NVIDIA 2018. NVIDIA Nsight Systems. Retrieved Jul 25, 2024 from https:\/\/developer.nvidia.com\/nsight-systems"},{"key":"e_1_3_2_1_31_1","volume-title":"Retrieved","author":"NVIDIA","year":"2019","unstructured":"NVIDIA 2019. Getting Started with CUDA Graphs. 
Retrieved Apr 17, 2025 from https:\/\/developer.nvidia.com\/blog\/cuda-graphs\/"},{"key":"e_1_3_2_1_32_1","volume-title":"PyTorch: an imperative style, high-performance deep learning library","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: an imperative style, high-performance deep learning library. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_33_1","volume-title":"Split-wise: Efficient generative LLM inference using phase splitting. arXiv:2311.18677 [cs.AR]","author":"Patel Pratyush","year":"2024","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, Aashaka Shah, \u00cd\u00f1igo Goiri, Saeed Maleki, and Ricardo Bianchini. 2024. Split-wise: Efficient generative LLM inference using phase splitting. arXiv:2311.18677 [cs.AR]"},{"key":"e_1_3_2_1_34_1","volume-title":"Retrieved","author":"Pybind","year":"2015","unstructured":"Pybind 2015. pybind\/pybind11. Retrieved Jun 30, 2024 from https:\/\/github.com\/pybind\/pybind11"},{"key":"e_1_3_2_1_35_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation \u2013 A KVCache-centric Architecture for Serving LLM Chatbot. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). USENIX Association, Santa Clara, CA, 155\u2013170. https:\/\/www.usenix.org\/conference\/fast25\/presentation\/qin"},{"key":"e_1_3_2_1_36_1","volume-title":"The sparsely-gated mixture-of-experts layer. Outrageously large neural networks","author":"Shazeer N","year":"2017","unstructured":"N Shazeer, A Mirhoseini, K Maziarz, A Davis, Q Le, G Hinton, and J Dean. 2017. The sparsely-gated mixture-of-experts layer. Outrageously large neural networks (2017)."},{"key":"e_1_3_2_1_37_1","volume-title":"Promoe: Fast moe-based llm serving using proactive caching. arXiv preprint arXiv:2410.22134","author":"Song Xiaoniu","year":"2024","unstructured":"Xiaoniu Song, Zihang Zhong, Rong Chen, and Haibo Chen. 2024. Promoe: Fast moe-based llm serving using proactive caching. arXiv preprint arXiv:2410.22134 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles","author":"Song Yixin","year":"2024","unstructured":"Yixin Song, Zeyu Mi, Haotong Xie, and Haibo Chen. 2024. PowerInfer: Fast Large Language Model Serving with a Consumer-grade GPU. In Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles (Austin, TX, USA) (SOSP '24). Association for Computing Machinery, New York, NY, USA, 590\u2013606. 10.1145\/3694715.3695964"},{"key":"e_1_3_2_1_39_1","first-page":"16342","article-title":"Specexec: Massively parallel speculative decoding for interactive llm inference on consumer devices","volume":"37","author":"Svirschevski Ruslan","year":"2024","unstructured":"Ruslan Svirschevski, Avner May, Zhuoming Chen, Beidi Chen, Zhihao Jia, and Max Ryabinin. 2024. Specexec: Massively parallel speculative decoding for interactive llm inference on consumer devices. 
Advances in Neural Information Processing Systems 37 (2024), 16342\u201316368.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","volume-title":"Hobbit: A mixed precision expert offloading system for fast moe inference. arXiv preprint arXiv:2411.01433","author":"Tang Peng","year":"2024","unstructured":"Peng Tang, Jiacheng Liu, Xiaofeng Hou, Yifei Pu, Jing Wang, Pheng-Ann Heng, Chao Li, and Minyi Guo. 2024. Hobbit: A mixed precision expert offloading system for fast moe inference. arXiv preprint arXiv:2411.01433 (2024)."},{"key":"e_1_3_2_1_41_1","unstructured":"Kimi Team Yifan Bai Yiping Bao Guanduo Chen Jiahao Chen Ningxin Chen Ruijue Chen Yanru Chen Yuankun Chen Yutian Chen Zhuofu Chen Jialei Cui Hao Ding Mengnan Dong Angang Du Chenzhuang Du Dikang Du Yulun Du Yu Fan Yichen Feng Kelin Fu Bofei Gao Hongcheng Gao Peizhong Gao Tong Gao Xinran Gu Longyu Guan Haiqing Guo Jianhang Guo Hao Hu Xiaoru Hao Tianhong He Weiran He Wenyang He Chao Hong Yangyang Hu Zhenxing Hu Weixiao Huang Zhiqi Huang Zihao Huang Tao Jiang Zhejun Jiang Xinyi Jin Yongsheng Kang Guokun Lai Cheng Li Fang Li Haoyang Li Ming Li Wentao Li Yanhao Li Yiwei Li Zhaowei Li Zheming Li Hongzhan Lin Xiaohan Lin Zongyu Lin Chengyin Liu Chenyu Liu Hongzhang Liu Jingyuan Liu Junqi Liu Liang Liu Shaowei Liu T. Y. Liu Tianwei Liu Weizhou Liu Yangyang Liu Yibo Liu Yiping Liu Yue Liu Zhengying Liu Enzhe Lu Lijun Lu Shengling Ma Xinyu Ma Yingwei Ma Shaoguang Mao Jie Mei Xin Men Yibo Miao Siyuan Pan Yebo Peng Ruoyu Qin Bowen Qu Zeyu Shang Lidong Shi Shengyuan Shi Feifan Song Jianlin Su Zhengyuan Su Xinjie Sun Flood Sung Heyi Tang Jiawen Tao Qifeng Teng Chensi Wang Dinglu Wang Feng Wang Haiming Wang Jianzhou Wang Jiaxing Wang Jinhong Wang Shengjie Wang Shuyi Wang Yao Wang Yejie Wang Yiqin Wang Yuxin Wang Yuzhi Wang Zhaoji Wang Zhengtao Wang Zhexu Wang Chu Wei Qianqian Wei Wenhao Wu Xingzhe Wu Yuxin Wu Chenjun Xiao Xiaotong Xie Weimin Xiong Boyu Xu Jing Xu Jinjing Xu L. H. Xu Lin Xu Suting Xu Weixin Xu Xinran Xu Yangchuan Xu Ziyao Xu Junjie Yan Yuzi Yan Xiaofei Yang Ying Yang Zhen Yang Zhilin Yang Zonghan Yang Haotian Yao Xingcheng Yao Wenjie Ye Zhuorui Ye Bohong Yin Longhui Yu Enming Yuan Hong-bang Yuan Mengjie Yuan Haobing Zhan Dehao Zhang Hao Zhang Wanlu Zhang Xiaobin Zhang Yangkun Zhang Yizhi Zhang Yongting Zhang Yu Zhang Yutao Zhang Yutong Zhang Zheng Zhang Haotian Zhao Yikai Zhao Huabin Zheng Shaojie Zheng Jianren Zhou Xinyu Zhou Zaida Zhou Zhen Zhu Weiyu Zhuang and Xinxing Zu. 2025. Kimi K2: Open Agentic Intelligence. arXiv:2507.20534 [cs.LG] https:\/\/arxiv.org\/abs\/2507.20534"},{"key":"e_1_3_2_1_42_1","unstructured":"Qwen Team. 2024. Qwen1.5-MoE: Matching 7B Model Performance with 1\/3 Activated Parameters\". https:\/\/qwenlm.github.io\/blog\/qwenmoe\/"},{"key":"e_1_3_2_1_43_1","volume-title":"uxlfoundation\/oneDNN. Retrieved","author":"Unified Acceleration","year":"2025","unstructured":"Unified Acceleration (UXL) Foundation 2016. uxlfoundation\/oneDNN. Retrieved Apr 6, 2025 from https:\/\/github.com\/uxlfoundation\/oneDNN"},{"key":"e_1_3_2_1_44_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. 
Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_45_1","volume-title":"Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453","author":"Wang Hongyu","year":"2023","unstructured":"Hongyu Wang, Shuming Ma, Li Dong, Shaohan Huang, Huaijie Wang, Lingxiao Ma, Fan Yang, Ruiping Wang, Yi Wu, and Furu Wei. 2023. Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453 (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"Contamination-Free LLM Benchmark. In The Thirteenth International Conference on Learning Representations.","author":"White Colin","year":"2025","unstructured":"Colin White, Samuel Dooley, Manley Roberts, Arka Pal, Benjamin Feuer, Siddhartha Jain, Ravid Shwartz-Ziv, Neel Jain, Khalid Saifullah, Sreemanti Dey, Shubh-Agrawal, Sandeep Singh Sandha, Siddartha Venkat Naidu, Chinmay Hegde, Yann LeCun, Tom Goldstein, Willie Neiswanger, and Micah Goldblum. 2025. LiveBench: A Challenging, Contamination-Free LLM Benchmark. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38\u201345","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38\u201345. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_48_1","volume-title":"Moe-infinity: Activation-aware expert offloading for efficient moe serving. arXiv e-prints","author":"Xue Leyang","year":"2024","unstructured":"Leyang Xue, Yao Fu, Zhan Lu, Luo Mai, and Mahesh Marina. 2024. Moe-infinity: Activation-aware expert offloading for efficient moe serving. arXiv e-prints (2024), arXiv\u20132401."},{"key":"e_1_3_2_1_49_1","volume-title":"Powerinfer-2: Fast large language model inference on a smartphone. arXiv preprint arXiv:2406.06282","author":"Xue Zhenliang","year":"2024","unstructured":"Zhenliang Xue, Yixin Song, Zeyu Mi, Xinrui Zheng, Yubin Xia, and Haibo Chen. 2024. Powerinfer-2: Fast large language model inference on a smartphone. arXiv preprint arXiv:2406.06282 (2024)."},{"key":"e_1_3_2_1_50_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang Guanting Dong Haoran Wei Huan Lin Jialong Tang Jialin Wang Jian Yang Jianhong Tu Jianwei Zhang Jianxin Ma Jin Xu Jingren Zhou Jinze Bai Jinzheng He Junyang Lin Kai Dang Keming Lu Keqin Chen Kexin Yang Mei Li Mingfeng Xue Na Ni Pei Zhang Peng Wang Ru Peng Rui Men Ruize Gao Runji Lin Shijie Wang Shuai Bai Sinan Tan Tianhang Zhu Tianhao Li Tianyu Liu Wenbin Ge Xiaodong Deng Xiaohuan Zhou Xingzhang Ren Xinyu Zhang Xipin Wei Xuancheng Ren Yang Fan Yang Yao Yichang Zhang Yu Wan Yunfei Chu Yuqiong Liu Zeyu Cui Zhenru Zhang and Zhihao Fan. 2024. Qwen2 Technical Report. 
arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"Flashinfer: Efficient and customizable attention engine for llm inference serving. arXiv preprint arXiv:2501.01005","author":"Ye Zihao","year":"2025","unstructured":"Zihao Ye, Lequn Chen, Ruihang Lai, Wuwei Lin, Yineng Zhang, Stephanie Wang, Tianqi Chen, Baris Kasikci, Vinod Grover, Arvind Krishnamurthy, et al. 2025. Flashinfer: Efficient and customizable attention engine for llm inference serving. arXiv preprint arXiv:2501.01005 (2025)."},{"key":"e_1_3_2_1_52_1","volume-title":"EdgeMoE: Empowering Sparse Large Language Models on Mobile Devices","author":"Yi Rongjie","year":"2025","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2025. EdgeMoE: Empowering Sparse Large Language Models on Mobile Devices. IEEE Transactions on Mobile Computing (2025)."},{"key":"e_1_3_2_1_53_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Yue Tongtian","year":"2024","unstructured":"Tongtian Yue, Longteng Guo, Jie Cheng, Xuange Gao, Hua Huang, and Jing Liu. 2024. Ada-k routing: Boosting the efficiency of moe-based llms. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_54_1","volume-title":"Understanding the overheads of launching CUDA kernels. ICPP19","author":"Zhang Lingqi","year":"2019","unstructured":"Lingqi Zhang, Mohamed Wahib, and Satoshi Matsuoka. 2019. Understanding the overheads of launching CUDA kernels. ICPP19 (2019), 5\u20138."},{"key":"e_1_3_2_1_55_1","first-page":"196","article-title":"Atom: Low-bit quantization for efficient and accurate llm serving","volume":"6","author":"Zhao Yilong","year":"2024","unstructured":"Yilong Zhao, Chien-Yu Lin, Kan Zhu, Zihao Ye, Lequn Chen, Size Zheng, Luis Ceze, Arvind Krishnamurthy, Tianqi Chen, and Baris Kasikci. 2024. Atom: Low-bit quantization for efficient and accurate llm serving. Proceedings of Machine Learning and Systems 6 (2024), 196\u2013209.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_56_1","first-page":"62557","article-title":"Sglang: Efficient execution of structured language model programs","volume":"37","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Livia Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. 2024. Sglang: Efficient execution of structured language model programs. Advances in Neural Information Processing Systems 37 (2024), 62557\u201362583.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design. 1\u20139.","author":"Zhong Shuzhang","year":"2024","unstructured":"Shuzhang Zhong, Ling Liang, Yuan Wang, Runsheng Wang, Ru Huang, and Meng Li. 2024. Adapmoe: Adaptive sensitivity-based expert gating and management for efficient moe inference. In Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design. 1\u20139."},{"key":"e_1_3_2_1_58_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. {DistServe}: Disaggregating prefill and decoding for goodput-optimized large language model serving. 
In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 193\u2013210."}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:45:55Z","timestamp":1759322755000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764843"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":58,"alternative-id":["10.1145\/3731569.3764843","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764843","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
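The object above appears to be a Crossref "works" message (status / message-type / message envelope) for DOI 10.1145/3731569.3764843. As a minimal sketch only — assuming network access and that the same record is served by the public Crossref REST API works endpoint (https://api.crossref.org/works/{DOI}); the field accesses below mirror the structure shown above — this is one way such a record could be fetched and its key fields read:

```python
# Minimal sketch: fetch the Crossref work record for the DOI shown above and
# print a few of its fields. Assumes the public Crossref REST API is reachable.
import json
import urllib.request

DOI = "10.1145/3731569.3764843"  # DOI taken from the record above

with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    work = json.load(resp)["message"]          # unwrap the "message" envelope

print(work["title"][0])                        # "KTransformers: Unleashing the Full Potential ..."
print(work["DOI"], work["type"])               # DOI and record type ("proceedings-article")
print(len(work.get("reference", [])))          # number of deposited references (58 in this record)
print(", ".join(f'{a["given"]} {a["family"]}'  # author list from the "author" array
                for a in work["author"]))
```

Run as-is, this would print the paper title, its DOI and type, the deposited reference count, and the author list, all of which correspond to the fields visible in the record above.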