{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T13:05:19Z","timestamp":1780664719165,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2527416"],"award-info":[{"award-number":["2527416"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2534241"],"award-info":[{"award-number":["2534241"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2523997"],"award-info":[{"award-number":["2523997"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2340125"],"award-info":[{"award-number":["2340125"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2127918"],"award-info":[{"award-number":["2127918"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000002","name":"NIH (National Institutes of 
Health)","doi-asserted-by":"publisher","award":["R01CA297832"],"award-info":[{"award-number":["R01CA297832"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769319","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"176-191","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Taming Latency-Memory Trade-Off in MoE-Based LLM Serving via Fine-Grained Expert Offloading"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5790-4981","authenticated-orcid":false,"given":"Hanfei","family":"Yu","sequence":"first","affiliation":[{"name":"Stevens Institute of Technology, Hoboken, NJ, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9561-0118","authenticated-orcid":false,"given":"Xingqi","family":"Cui","sequence":"additional","affiliation":[{"name":"Rice University, Houston, Texas, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7468-3746","authenticated-orcid":false,"given":"Hong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Waterloo, Waterloo, Ontario, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7308-938X","authenticated-orcid":false,"given":"Hao","family":"Wang","sequence":"additional","affiliation":[{"name":"Rutgers University, New Brunswick, NJ, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1444-2657","authenticated-orcid":false,"given":"Hao","family":"Wang","sequence":"additional","affiliation":[{"name":"Stevens Institute of Technology, Hoboken, NJ, 
USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, et al. 2024. Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et al. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_5_1","unstructured":"AWS. 2006. AWS EC2: Secure and Resizable Compute Capacity in the Cloud. https:\/\/aws.amazon.com\/ec2\/."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3721146.3721940"},{"key":"e_1_3_2_1_7_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language Models are Few-Shot Learners. 
Advances in neural information processing systems (2020)."},{"key":"e_1_3_2_1_8_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Cai Ruisi","year":"2024","unstructured":"Ruisi Cai, Yeonju Ro, Geon-Woo Kim, Peihao Wang, Babak Ehteshami Bejnordi, Aditya Akella, and Zhangyang Wang. 2024. Read-ME: Refactorizing LLMs as Router-Decoupled Mixture of Experts with System Co-Design. In The Thirty-eighth Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-00296-0"},{"key":"e_1_3_2_1_10_1","volume-title":"Introduction to Algorithms","author":"Cormen Thomas H","unstructured":"Thomas H Cormen, Charles E Leiserson, Ronald L Rivest, and Clifford Stein. 2022. Introduction to Algorithms. MIT press."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Damai Dai Chengqi Deng Chenggang Zhao RX Xu Huazuo Gao Deli Chen Jiashi Li Wangding Zeng Xingkai Yu Y Wu et al. 2024. DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models. arXiv preprint arXiv:2401.06066 (2024).","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671882"},{"key":"e_1_3_2_1_13_1","volume-title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations (ICLR).","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, Zhifeng Chen. 2021. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. 
In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of Machine Learning and Systems (MLSys)","author":"Du Zhixu","year":"2024","unstructured":"Zhixu Du, Shiyu Li, Yuhao Wu, Xiangyu Jiang, Jingwei Sun, Qilin Zheng, Yongkai Wu, Ang Li, Hai Li, and Yiran Chen. 2024. SiDA: Sparsity-Inspired Data-Aware Serving for Efficient and Scalable Large Mixture-of-Experts Models. Proceedings of Machine Learning and Systems (MLSys) (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Covering Spheres with Spheres. Discrete & Computational Geometry","author":"Dumer Ilya","year":"2007","unstructured":"Ilya Dumer. 2007. Covering Spheres with Spheres. Discrete & Computational Geometry (2007)."},{"key":"e_1_3_2_1_16_1","volume-title":"Fast Inference of Mixture-of-Experts Language Models with Offloading. arXiv preprint arXiv:2312.17238","author":"Eliseev Artyom","year":"2023","unstructured":"Artyom Eliseev and Denis Mazur. 2023. Fast Inference of Mixture-of-Experts Language Models with Offloading. arXiv preprint arXiv:2312.17238 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"The Minimum Covering Sphere Problem. Management Science","author":"Jack Elzinga D","year":"1972","unstructured":"D Jack Elzinga and Donald W Hearn. 1972. The Minimum Covering Sphere Problem. Management Science (1972)."},{"key":"e_1_3_2_1_18_1","volume-title":"Lynx: Enabling Efficient MoE Inference through Dynamic Batch-Aware Expert Selection. arXiv preprint arXiv:2411.08982","author":"Gupta Vima","year":"2024","unstructured":"Vima Gupta, Kartik Sinha, Ada Gavrilovska, and Anand Padmanabha Iyer. 2024. Lynx: Enabling Efficient MoE Inference through Dynamic Batch-Aware Expert Selection. arXiv preprint arXiv:2411.08982 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Mark Wiebe, Pearu Peterson, Pierre G\u00e9rard-Marchant, Kevin Sheppard, Tyler Reddy, Warren Weckesser, Hameer Abbasi, Christoph Gohlke, and Travis E. 
Oliphant.","author":"Harris Charles R.","year":"2020","unstructured":"Charles R. Harris, K. Jarrod Millman, St\u00e9fan J. van der Walt, Ralf Gommers, Pauli Virtanen, David Cournapeau, Eric Wieser, Julian Taylor, Sebastian Berg, Nathaniel J. Smith, Robert Kern, Matti Picus, Stephan Hoyer, Marten H. van Kerkwijk, Matthew Brett, Allan Haldane, Jaime Fern\u00e1ndez del R\u00edo, Mark Wiebe, Pearu Peterson, Pierre G\u00e9rard-Marchant, Kevin Sheppard, Tyler Reddy, Warren Weckesser, Hameer Abbasi, Christoph Gohlke, and Travis E. Oliphant. 2020. Array Programming with NumPy. Nature (2020)."},{"key":"e_1_3_2_1_20_1","volume-title":"Capacity-Aware Inference: Mitigating the Straggler Effect in Mixture of Experts. arXiv preprint arXiv:2503.05066","author":"He Shwai","year":"2025","unstructured":"Shwai He, Weilin Cai, Jiayi Huang, and Ang Li. 2025. Capacity-Aware Inference: Mitigating the Straggler Effect in Mixture of Experts. arXiv preprint arXiv:2503.05066 (2025)."},{"key":"e_1_3_2_1_21_1","volume-title":"Towards Efficient Mixture of Experts: A Holistic Study of Compression Techniques. arXiv preprint arXiv:2406.02500","author":"He Shwai","year":"2024","unstructured":"Shwai He, Daize Dong, Liang Ding, and Ang Li. 2024. Towards Efficient Mixture of Experts: A Holistic Study of Compression Techniques. arXiv preprint arXiv:2406.02500 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable Mixture-of-Expert Inference. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA).","author":"Hwang Ranggi","year":"2024","unstructured":"Ranggi Hwang, Jianyu Wei, Shijie Cao, Changho Hwang, Xiaohu Tang, Ting Cao, and Mao Yang. 2024. Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable Mixture-of-Expert Inference. 
In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)."},{"key":"e_1_3_2_1_23_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. 2024. Mixtral of Experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643733"},{"key":"e_1_3_2_1_25_1","volume-title":"Mixture of Lookup Experts. In International Conference on Machine Learning (ICML).","author":"Jie Shibo","year":"2025","unstructured":"Shibo Jie, Yehui Tang, Kai Han, Yitong Li, Duyu Tang, Zhi-Hong Deng, and Yunhe Wang. 2025. Mixture of Lookup Experts. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_26_1","volume-title":"Fiddler: CPU-GPU Orchestration for Fast Inference of Mixture-of-Experts Models. In International Conference on Learning Representations (ICLR).","author":"Kamahori Keisuke","year":"2025","unstructured":"Keisuke Kamahori, Tian Tang, Yile Gu, Kan Zhu, and Baris Kasikci. 2025. Fiddler: CPU-GPU Orchestration for Fast Inference of Mixture-of-Experts Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_27_1","volume-title":"Mixture of Quantized Experts (MoQE): Complementary Effect of Low-bit Quantization and Robustness. arXiv preprint arXiv:2310.02410","author":"Kim Young Jin","year":"2023","unstructured":"Young Jin Kim, Raffy Fahim, and Hany Hassan Awadalla. 2023. Mixture of Quantized Experts (MoQE): Complementary Effect of Low-bit Quantization and Robustness. arXiv preprint arXiv:2310.02410 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_29_1","volume-title":"When It Doesn't, and Why. 
ACM Transactions on Architecture and Code Optimization (TACO)","author":"Lee Jaekyu","year":"2012","unstructured":"Jaekyu Lee, Hyesoon Kim, and Richard Vuduc. 2012. When Prefetching Works, When It Doesn't, and Why. ACM Transactions on Architecture and Code Optimization (TACO) (2012)."},{"key":"e_1_3_2_1_30_1","volume-title":"STUN: Structured-Then-Unstructured Pruning for Scalable MoE Pruning. arXiv preprint arXiv:2409.06211","author":"Lee Jaeseong","year":"2024","unstructured":"Jaeseong Lee, Aurick Qiao, Daniel F Campos, Zhewei Yao, Yuxiong He, et al. 2024. STUN: Structured-Then-Unstructured Pruning for Scalable MoE Pruning. arXiv preprint arXiv:2409.06211 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Lee Wonbeom","year":"2024","unstructured":"Wonbeom Lee, Jungi Lee, Junghwan Seo, and Jaewoong Sim. 2024. InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_32_1","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, Cong Wang, and Hong Xu. 2023. Accelerating Distributed MoE training and inference with Lina. In 2023 USENIX Annual Technical Conference (USENIX ATC 23)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643754"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657807"},{"key":"e_1_3_2_1_35_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. DeepSeek-V3 Technical Report. 
arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Optimizing Distributed Deployment of Mixture-of-Experts Model Inference in Serverless Computing. In IEEE Conference on Computer Communications (INFOCOM).","author":"Liu Mengfan","year":"2025","unstructured":"Mengfan Liu, Wei Wang, and Chuan Wu. 2025. Optimizing Distributed Deployment of Mixture-of-Experts Model Inference in Serverless Computing. In IEEE Conference on Computer Communications (INFOCOM)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672274"},{"key":"e_1_3_2_1_38_1","unstructured":"Tomas Mikolov Kai Chen Greg Corrado and Jeffrey Dean. 2013. Efficient Estimation of Word Representations in Vector Space. (2013)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3639187"},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA. 2024. CUDA Runtime API :: CUDA Toolkit Documentation. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/index.html."},{"key":"e_1_3_2_1_41_1","unstructured":"Ollama. 2024. Get Up and Running with Large Language Models. https:\/\/ollama.com\/."},{"key":"e_1_3_2_1_42_1","unstructured":"Adam Paszke Sam Gross Francisco Massa Adam Lerer James Bradbury Gregory Chanan Trevor Killeen Zeming Lin Natalia Gimelshein Luca Antiga et al. 2019. PyTorch: An Imperative Style High-Performance Deep Learning Library. Advances in Neural Information Processing Systems (NIPS) (2019)."},{"key":"e_1_3_2_1_43_1","volume-title":"Splitwise: Efficient Generative LLM Inference Using Phase Splitting. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA).","author":"Patel Pratyush","year":"2024","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, Aashaka Shah, \u00cd\u00f1igo Goiri, Saeed Maleki, and Ricardo Bianchini. 2024. Splitwise: Efficient Generative LLM Inference Using Phase Splitting. 
In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)."},{"key":"e_1_3_2_1_44_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Li Pingzhi","year":"2024","unstructured":"Pingzhi Li, Zhenyu Zhang, Prateek Yadav, Yi-Lin Sung, Yu Cheng, Mohit Bansal, Tianlong Chen. 2024. Merge, Then Compress: Demystify Efficient SMoE with Hints from Its Routing Policy. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_45_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language Models are Unsupervised Multi-task Learners. OpenAI blog (2019)."},{"key":"e_1_3_2_1_46_1","volume-title":"On the Closest Packing of Spheres in N Dimensions. Annals of Mathematics","author":"Rankin Robert Alexander","year":"1947","unstructured":"Robert Alexander Rankin. 1947. On the Closest Packing of Spheres in N Dimensions. Annals of Mathematics (1947)."},{"key":"e_1_3_2_1_47_1","volume-title":"SwapMoE: Serving Off-the-shelf MoE-based Large Language Models with Tunable Memory Budget. arXiv preprint arXiv:2308.15030","author":"Kong Rui","year":"2023","unstructured":"Rui Kong, Yuanchun Li, Qingtian Feng, Weijun Wang, Xiaozhou Ye, Ye Ouyang, Linghe Kong, Yunxin Liu. 2023. SwapMoE: Serving Off-the-shelf MoE-based Large Language Models with Tunable Memory Budget. arXiv preprint arXiv:2308.15030 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"A Mathematical Theory of Communication. The Bell System Technical Journal","author":"Shannon Claude Elwood","year":"1948","unstructured":"Claude Elwood Shannon. 1948. A Mathematical Theory of Communication. The Bell System Technical Journal (1948)."},{"key":"e_1_3_2_1_49_1","unstructured":"ShareGPT. 2022. ShareGPT: Share Your Wildest ChatGPT Conversations. https:\/\/sharegpt.com\/."},{"key":"e_1_3_2_1_50_1","volume-title":"Snowflake Arctic: The Best LLM for Enterprise AI. 
https:\/\/www.snowflake.com\/en\/data-cloud\/arctic\/.","year":"2024","unstructured":"Snowflake. 2024. Snowflake Arctic: The Best LLM for Enterprise AI. https:\/\/www.snowflake.com\/en\/data-cloud\/arctic\/."},{"key":"e_1_3_2_1_51_1","volume-title":"ProMoE: Fast MoE-based LLM Serving using Proactive Caching. arXiv preprint arXiv:2410.22134","author":"Song Xiaoniu","year":"2024","unstructured":"Xiaoniu Song, Zihang Zhong, and Rong Chen. 2024. ProMoE: Fast MoE-based LLM Serving using Proactive Caching. arXiv preprint arXiv:2410.22134 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"DynamoLLM: Designing LLM Inference Clusters for Performance and Energy Efficiency. In International Symposium on High-Performance Computer Architecture (HPCA).","author":"Stojkovic Jovan","year":"2025","unstructured":"Jovan Stojkovic, Chaojie Zhang, \u00cd\u00f1igo Goiri, Josep Torrellas, and Esha Choukse. 2025. DynamoLLM: Designing LLM Inference Clusters for Performance and Energy Efficiency. In International Symposium on High-Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_53_1","volume-title":"Hobbit: A Mixed Precision Expert Offloading System for Fast MoE Inference. arXiv preprint arXiv:2411.01433","author":"Tang Peng","year":"2024","unstructured":"Peng Tang, Jiacheng Liu, Xiaofeng Hou, Yifei Pu, Jing Wang, Pheng-Ann Heng, Chao Li, and Minyi Guo. 2024. Hobbit: A Mixed Precision Expert Offloading System for Fast MoE Inference. arXiv preprint arXiv:2411.01433 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. 
Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations.","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, Remi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander Rush. 2020. Hugging-Face's Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717455"},{"key":"e_1_3_2_1_57_1","unstructured":"xAI. 2023. Announcing Grok. https:\/\/x.ai\/blog\/grok."},{"key":"e_1_3_2_1_58_1","volume-title":"MoE-Infinity: Efficient MoE Inference on Personal Machines with Sparsity-Aware Expert Cache. arXiv preprint arXiv:2401.14361","author":"Xue Leyang","year":"2024","unstructured":"Leyang Xue, Yao Fu, Zhan Lu, Luo Mai, and Mahesh Marina. 2024. MoE-Infinity: Efficient MoE Inference on Personal Machines with Sparsity-Aware Expert Cache. arXiv preprint arXiv:2401.14361 (2024)."},{"key":"e_1_3_2_1_59_1","unstructured":"Xue Leyang and Fu Yao and Lu Zhan and Mai Luo and Marina Mahesh. [n. d.]. MoE-Infinity Codebase. https:\/\/github.com\/TorchMoE\/MoE-Infinity."},{"key":"e_1_3_2_1_60_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang et al. 2024. Qwen2 Technical Report. 
arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2012.2200299"},{"key":"e_1_3_2_1_62_1","volume-title":"DAOP: Data-Aware Offloading and Predictive Pre-Calculation for Efficient MoE Inference. In Design Automation and Test in Europe (DATE).","author":"Zhang Yujie","year":"2025","unstructured":"Yujie Zhang, Shivam Aggarwal, and Tulika Mitra. 2025. DAOP: Data-Aware Offloading and Predictive Pre-Calculation for Efficient MoE Inference. In Design Automation and Test in Europe (DATE)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657828"},{"key":"e_1_3_2_1_64_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Tianle Li Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zhuohan Li Zi Lin Eric P Xing et al. 2023. LMSYS-Chat-1M: A Large-Scale Real-World LLM Conversation Dataset. arXiv preprint arXiv:2309.11998 (2023)."},{"key":"e_1_3_2_1_65_1","volume-title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_66_1","volume-title":"FloE: On-the-Fly MoE Inference on Memory-constrained GPU. In International Conference on Machine Learning (ICML).","author":"Zhou Yuxin","year":"2025","unstructured":"Yuxin Zhou, Zheng Li, Jun Zhang, Jue Wang, Yiping Wang, Zhongle Xie, Ke Chen, and Lidan Shou. 2025. FloE: On-the-Fly MoE Inference on Memory-constrained GPU. 
In International Conference on Machine Learning (ICML)."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3767295.3769319","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3769319","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3769319","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T12:12:07Z","timestamp":1780661527000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769319"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":66,"alternative-id":["10.1145\/3767295.3769319","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769319","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}