{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T15:31:19Z","timestamp":1773588679807,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["U24A20234"],"award-info":[{"award-number":["U24A20234"]}]},{"name":"Young Elite Scientists Sponsorship Program by CAST","award":["YESS20240529"],"award-info":[{"award-number":["YESS20240529"]}]},{"name":"Research Grants Council of the Hong Kong SAR, China","award":["T45-401&#x5c;&#x2f;22-N"],"award-info":[{"award-number":["T45-401&#x5c;&#x2f;22-N"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3779212.3790187","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T13:55:26Z","timestamp":1773150926000},"page":"1185-1200","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MoE-APEX: An Efficient MoE Inference System with Adaptive Precision Expert Offloading"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-8196-3953","authenticated-orcid":false,"given":"Peng","family":"Tang","sequence":"first","affiliation":[{"name":"Shanghai Jiaotong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0378-2311","authenticated-orcid":false,"given":"Jiacheng","family":"Liu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4372-7851","authenticated-orcid":false,"given":"Xiaofeng","family":"Hou","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7220-1643","authenticated-orcid":false,"given":"Yifei","family":"Pu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7260-0521","authenticated-orcid":false,"given":"Jing","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3055-5034","authenticated-orcid":false,"given":"Pheng-Ann","family":"Heng","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6218-4659","authenticated-orcid":false,"given":"Chao","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-2302","authenticated-orcid":false,"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al., 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Gkd: Generalized knowledge distillation for auto-regressive sequence models. arXiv preprint arXiv:2306.13649","author":"Agarwal Rishabh","year":"2023","unstructured":"Rishabh Agarwal, Nino Vieillard, Piotr Stanczyk, Sabela Ramos, Matthieu Geist, and Olivier Bachem. 2023. Gkd: Generalized knowledge distillation for auto-regressive sequence models. arXiv preprint arXiv:2306.13649 (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_4_1","unstructured":"Apple. 2024. Introducing Apple's On-Device and Server Foundation Models. https:\/\/machinelearning.apple.com\/research\/introducing-apple-foundation-models."},{"key":"e_1_3_2_1_5_1","volume-title":"Compressing large language models by streamlining the unimportant layer. arXiv preprint arXiv:2403.19135","author":"Chen Xiaodong","year":"2024","unstructured":"Xiaodong Chen, Yuxuan Hu, and Jing Zhang. 2024. Compressing large language models by streamlining the unimportant layer. arXiv preprint arXiv:2403.19135 (2024)."},{"key":"e_1_3_2_1_6_1","unstructured":"Xiangxiang Chu Limeng Qiao Xinyu Zhang Shuang Xu Fei Wei Yang Yang Xiaofei Sun Yiming Hu Xinyang Lin Bo Zhang et al. 2024. Mobilevlm v2: Faster and stronger baseline for vision language model. arXiv preprint arXiv:2402.03766 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge. arXiv:1803.05457v1","author":"Clark Peter","year":"2018","unstructured":"Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. 2018. Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge. arXiv:1803.05457v1 (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168","author":"Cobbe Karl","year":"2021","unstructured":"Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, Christopher Hesse, and John Schulman. 2021. Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168 (2021)."},{"key":"e_1_3_2_1_9_1","volume-title":"DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models. CoRR","author":"Dai Damai","year":"2024","unstructured":"Damai Dai, Chengqi Deng, Chenggang Zhao, R. X. Xu, Huazuo Gao, Deli Chen, Jiashi Li, Wangding Zeng, Xingkai Yu, Y. Wu, Zhenda Xie, Y. K. Li, Panpan Huang, Fuli Luo, Chong Ruan, Zhifang Sui, and Wenfeng Liang. 2024. DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models. CoRR, Vol. abs\/2401.06066 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"DeepSeek-AI. 2024. DeepSeek-V2: A Strong Economical and Efficient Mixture-of-Experts Language Model. arXiv:2405.04434 [cs.CL]"},{"key":"e_1_3_2_1_11_1","first-page":"30318","article-title":"Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale","volume":"35","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale. Advances in Neural Information Processing Systems, Vol. 35 (2022), 30318-30332.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","volume-title":"Learning Factored Representations in a Deep Mixture of Experts. CoRR","author":"Eigen David","year":"2013","unstructured":"David Eigen, Marc'Aurelio Ranzato, and Ilya Sutskever. 2013. Learning Factored Representations in a Deep Mixture of Experts. CoRR, Vol. abs\/1312.4314 (2013)."},{"key":"e_1_3_2_1_13_1","volume-title":"Fast inference of mixture-of-experts language models with offloading. arXiv preprint arXiv:2312.17238","author":"Eliseev Artyom","year":"2023","unstructured":"Artyom Eliseev and Denis Mazur. 2023. Fast inference of mixture-of-experts language models with offloading. arXiv preprint arXiv:2312.17238 (2023)."},{"key":"e_1_3_2_1_14_1","unstructured":"Hugging Face. 2024. Hugging Face Hub. https:\/\/huggingface.co\/docs\/hub\/index."},{"key":"e_1_3_2_1_15_1","article-title":"Switch transformers: scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: scaling to trillion parameter models with simple and efficient sparsity. J. Mach. Learn. Res., Vol. 23, 1, Article 120 (jan 2022), 39 pages.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_16_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_17_1","volume-title":"Burak Kantarci, Burak Cakmak, Arda Ozgun, and Nassira Ghoualmi-Zine.","author":"Friha Othmane","year":"2024","unstructured":"Othmane Friha, Mohamed Amine Ferrag, Burak Kantarci, Burak Cakmak, Arda Ozgun, and Nassira Ghoualmi-Zine. 2024. LLM-Based Edge Intelligence: A Comprehensive Survey on Architectures, Applications, Security and Trustworthiness. IEEE Open Journal of the Communications Society (2024)."},{"key":"e_1_3_2_1_18_1","unstructured":"Georgi Gerganov. 2023. ggerganov\/llama.cpp: Port of facebook's llama model in c\/c. https:\/\/github.com\/ggerganov\/llama.cpp."},{"key":"e_1_3_2_1_19_1","volume-title":"Knowledge distillation of large language models. arXiv preprint arXiv:2306.08543","author":"Gu Yuxian","year":"2023","unstructured":"Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. 2023. Knowledge distillation of large language models. arXiv preprint arXiv:2306.08543 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Accelerate: Training and inference at scale made simple, efficient and adaptable. https:\/\/github.com\/huggingface\/accelerate.","author":"Gugger Sylvain","year":"2022","unstructured":"Sylvain Gugger, Lysandre Debut, Thomas Wolf, Philipp Schmid, Zachary Mueller, Sourab Mangrulkar, Marc Sun, and Benjamin Bossan. 2022. Accelerate: Training and inference at scale made simple, efficient and adaptable. https:\/\/github.com\/huggingface\/accelerate."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.142911"},{"key":"e_1_3_2_1_22_1","volume-title":"Mixture Compressor for Mixture-of-Experts LLMs Gains More. arXiv preprint arXiv:2410.06270","author":"Huang Wei","year":"2024","unstructured":"Wei Huang, Yue Liao, Jianhui Liu, Ruifei He, Haoru Tan, Shiming Zhang, Hongsheng Li, Si Liu, and Xiaojuan Qi. 2024. Mixture Compressor for Mixture-of-Experts LLMs Gains More. arXiv preprint arXiv:2410.06270 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"In-context learning distillation: Transferring few-shot learning ability of pre-trained language models. arXiv preprint arXiv:2212.10670","author":"Huang Yukun","year":"2022","unstructured":"Yukun Huang, Yanda Chen, Zhou Yu, and Kathleen McKeown. 2022. In-context learning distillation: Transferring few-shot learning ability of pre-trained language models. arXiv preprint arXiv:2212.10670 (2022)."},{"key":"e_1_3_2_1_24_1","unstructured":"Huawei. 2023. Beating Google and Apple Huawei brings large AI model to mobile voice assistant. https:\/\/www.huaweicentral.com\/beating-google-and-apple-huawei-brings-large-ai-model-to-mobile-voice-assistant\/."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"e_1_3_2_1_27_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al., 2024. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Fiddler: CPU-GPU Orchestration for Fast Inference of Mixture-of-Experts Models. arXiv preprint arXiv:2402.07033","author":"Kamahori Keisuke","year":"2024","unstructured":"Keisuke Kamahori, Yile Gu, Kan Zhu, and Baris Kasikci. 2024. Fiddler: CPU-GPU Orchestration for Fast Inference of Mixture-of-Experts Models. arXiv preprint arXiv:2402.07033 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Shortened llama: A simple depth pruning for large language models. arXiv preprint arXiv:2402.02834","author":"Kim Bo-Kyeong","year":"2024","unstructured":"Bo-Kyeong Kim, Geonmin Kim, Tae-Ho Kim, Thibault Castells, Shinkook Choi, Junho Shin, and Hyoung-Kyu Song. 2024. Shortened llama: A simple depth pruning for large language models. arXiv preprint arXiv:2402.02834 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.363"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_32_1","volume-title":"CATS: Contextually-Aware Thresholding for Sparsity in Large Language Models. arXiv preprint arXiv:2404.08763","author":"Lee Je-Yong","year":"2024","unstructured":"Je-Yong Lee, Donghyun Lee, Genghan Zhang, Mo Tiwari, and Azalia Mirhoseini. 2024. CATS: Contextually-Aware Thresholding for Sparsity in Large Language Models. arXiv preprint arXiv:2404.08763 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Adaptive gating in mixture-of-experts based language models. arXiv preprint arXiv:2310.07188","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Qiang Su, Yitao Yang, Yimin Jiang, Cong Wang, and Hong Xu. 2023a. Adaptive gating in mixture-of-experts based language models. arXiv preprint arXiv:2310.07188 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"then compress: Demystify efficient SMoe with hints from its routing policy. arXiv preprint arXiv:2310.01334","author":"Li Pingzhi","year":"2023","unstructured":"Pingzhi Li, Zhenyu Zhang, Prateek Yadav, Yi-Lin Sung, Yu Cheng, Mohit Bansal, and Tianlong Chen. 2023c. Merge, then compress: Demystify efficient SMoe with hints from its routing policy. arXiv preprint arXiv:2310.01334 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Li Yixiao","year":"2023","unstructured":"Yixiao Li, Yifan Yu, Qingru Zhang, Chen Liang, Pengcheng He, Weizhu Chen, and Tuo Zhao. 2023b. Losparse: Structured compression of large language models based on low-rank and sparse approximation. In International Conference on Machine Learning. PMLR, 20336-20350."},{"key":"e_1_3_2_1_36_1","volume-title":"Truthfulqa: Measuring how models mimic human falsehoods. arXiv preprint arXiv:2109.07958","author":"Lin Stephanie","year":"2021","unstructured":"Stephanie Lin, Jacob Hilton, and Owain Evans. 2021. Truthfulqa: Measuring how models mimic human falsehoods. arXiv preprint arXiv:2109.07958 (2021)."},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Machine Learning. PMLR, 22137-22176","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, et al., 2023. Deja vu: Contextual sparsity for efficient llms at inference time. In International Conference on Machine Learning. PMLR, 22137-22176."},{"key":"e_1_3_2_1_38_1","volume-title":"Llm-pruner: On the structural pruning of large language models. Advances in neural information processing systems","author":"Ma Xinyin","year":"2023","unstructured":"Xinyin Ma, Gongfan Fang, and Xinchao Wang. 2023. Llm-pruner: On the structural pruning of large language models. Advances in neural information processing systems, Vol. 36 (2023), 21702-21720."},{"key":"e_1_3_2_1_39_1","volume-title":"Shortgpt: Layers in large language models are more redundant than you expect. arXiv preprint arXiv:2403.03853","author":"Men Xin","year":"2024","unstructured":"Xin Men, Mingyu Xu, Qingyu Zhang, Bingning Wang, Hongyu Lin, Yaojie Lu, Xianpei Han, and Weipeng Chen. 2024. Shortgpt: Layers in large language models are more redundant than you expect. arXiv preprint arXiv:2403.03853 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"GeForce RTX 2080 Ti. https:\/\/www.nvidia.com\/content\/geforce-gtx\/GEFORCE_RTX_2080Ti_User_Guide.pdf.","author":"NVIDIA.","year":"2019","unstructured":"NVIDIA. 2019. GeForce RTX 2080 Ti. https:\/\/www.nvidia.com\/content\/geforce-gtx\/GEFORCE_RTX_2080Ti_User_Guide.pdf."},{"key":"e_1_3_2_1_41_1","unstructured":"NVIDIA. 2024a. GeForce RTX 4090. https:\/\/www.nvidia.com\/en-us\/geforce\/graphics-cards\/40-series\/rtx-4090\/."},{"key":"e_1_3_2_1_42_1","unstructured":"NVIDIA. 2024b. Jetson AGX Orin Developer Kit. https:\/\/www.nvidia.com\/en-us\/autonomous-machines\/embedded-systems\/jetson-orin\/."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_44_1","unstructured":"Qualcomm AI Research. 2023. World's first on-device demonstration of Stable Diffusion on an Android phone. https:\/\/www.qualcomm.com\/news\/onq\/2023\/02\/worlds-first-on-device-demonstration-of-stable-diffusion-on-android."},{"key":"e_1_3_2_1_45_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_46_1","volume-title":"Promoe: Fast moe-based llm serving using proactive caching. arXiv preprint arXiv:2410.22134","author":"Song Xiaoniu","year":"2024","unstructured":"Xiaoniu Song, Zihang Zhong, Rong Chen, and Haibo Chen. 2024. Promoe: Fast moe-based llm serving using proactive caching. arXiv preprint arXiv:2410.22134 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Powerinfer: Fast large language model serving with a consumer-grade gpu. arXiv preprint arXiv:2312.12456","author":"Song Yixin","year":"2023","unstructured":"Yixin Song, Zeyu Mi, Haotong Xie, and Haibo Chen. 2023. Powerinfer: Fast large language model serving with a consumer-grade gpu. arXiv preprint arXiv:2312.12456 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"A simple and effective pruning approach for large language models. arXiv preprint arXiv:2306.11695","author":"Sun Mingjie","year":"2023","unstructured":"Mingjie Sun, Zhuang Liu, Anna Bair, and J Zico Kolter. 2023. A simple and effective pruning approach for large language models. arXiv preprint arXiv:2306.11695 (2023)."},{"key":"e_1_3_2_1_49_1","volume-title":"Hashimoto","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_2_1_50_1","unstructured":"Qwen Team. 2024. Qwen1.5-MoE: Matching 7B Model Performance with 1\/3 Activated Parameters''. https:\/\/qwenlm.github.io\/blog\/qwen-moe\/"},{"key":"e_1_3_2_1_51_1","volume-title":"Advances in Neural Information Processing Systems","author":"Tresp Volker","unstructured":"Volker Tresp. 2000. Mixtures of Gaussian Processes. In Advances in Neural Information Processing Systems, T. Leen, T. Dietterich, and V. Tresp (Eds.), Vol. 13. MIT Press."},{"key":"e_1_3_2_1_52_1","volume-title":"Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453","author":"Wang Hongyu","year":"2023","unstructured":"Hongyu Wang, Shuming Ma, Li Dong, Shaohan Huang, Huaijie Wang, Lingxiao Ma, Fan Yang, Ruiping Wang, Yi Wu, and Furu Wei. 2023. Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Svd-llm: Truncation-aware singular value decomposition for large language model compression. arXiv preprint arXiv:2403.07378","author":"Wang Xin","year":"2024","unstructured":"Xin Wang, Yu Zheng, Zhongwei Wan, and Mi Zhang. 2024. Svd-llm: Truncation-aware singular value decomposition for large language model compression. arXiv preprint arXiv:2403.07378 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"International Symposium on Advanced Parallel Processing Technologies. Springer, 231-245","author":"Wang Xinkai","year":"2025","unstructured":"Xinkai Wang, Yiming Zhuansun, Chao Li, Jing Wang, Xiaofeng Hou, Lingyu Sun, Luping Wang, and Minyi Guo. 2025. Asymserve: Demystifying and optimizing llm serving efficiency on cpu acceleration units. In International Symposium on Advanced Parallel Processing Technologies. Springer, 231-245."},{"key":"e_1_3_2_1_55_1","volume-title":"Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush.","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38-45."},{"key":"e_1_3_2_1_56_1","volume-title":"International Symposium on Advanced Parallel Processing Technologies. Springer, 257-266","author":"Wu Feiyang","year":"2025","unstructured":"Feiyang Wu, Zhuohang Bian, Guoyang Duan, Tianle Xu, Junchi Wu, Teng Ma, Yongqiang Yao, Ruihao Gong, and Youwei Zhuo. 2025. TokenSim: Enabling hardware and software exploration for large language model inference systems. In International Symposium on Advanced Parallel Processing Technologies. Springer, 257-266."},{"key":"e_1_3_2_1_57_1","volume-title":"International Conference on Machine Learning. PMLR, 38087-38099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087-38099."},{"key":"e_1_3_2_1_58_1","volume-title":"Moe-infinity: Activation-aware expert offloading for efficient moe serving. arXiv preprint arXiv:2401.14361","author":"Xue Leyang","year":"2024","unstructured":"Leyang Xue, Yao Fu, Zhan Lu, Luo Mai, and Mahesh Marina. 2024a. Moe-infinity: Activation-aware expert offloading for efficient moe serving. arXiv preprint arXiv:2401.14361 (2024)."},{"key":"e_1_3_2_1_59_1","volume-title":"PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282","author":"Xue Zhenliang","year":"2024","unstructured":"Zhenliang Xue, Yixin Song, Zeyu Mi, Le Chen, Yubin Xia, and Haibo Chen. 2024b. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282 (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352","author":"Yi Rongjie","year":"2023","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2023. Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352 (2023)."},{"key":"e_1_3_2_1_61_1","volume-title":"fMoE: Fine-Grained Expert Offloading for Large Mixture-of-Experts Serving. arXiv preprint arXiv:2502.05370","author":"Yu Hanfei","year":"2025","unstructured":"Hanfei Yu, Xingqi Cui, Hong Zhang, and Hao Wang. 2025. fMoE: Fine-Grained Expert Offloading for Large Mixture-of-Experts Serving. arXiv preprint arXiv:2502.05370 (2025)."},{"key":"e_1_3_2_1_62_1","volume-title":"Asvd: Activation-aware singular value decomposition for compressing large language models. arXiv preprint arXiv:2312.05821","author":"Yuan Zhihang","year":"2023","unstructured":"Zhihang Yuan, Yuzhang Shang, Yue Song, Qiang Wu, Yan Yan, and Guangyu Sun. 2023. Asvd: Activation-aware singular value decomposition for compressing large language models. arXiv preprint arXiv:2312.05821 (2023)."},{"key":"e_1_3_2_1_63_1","unstructured":"Peiyuan Zhang Guangtao Zeng Tianduo Wang and Wei Lu. 2024. TinyLlama: An Open-Source Small Language Model. arXiv:2401.02385 [cs.CL]"},{"key":"e_1_3_2_1_64_1","volume-title":"Automation & Test in Europe Conference (DATE). IEEE, 1-7.","author":"Zhang Yujie","year":"2025","unstructured":"Yujie Zhang, Shivam Aggarwal, and Tulika Mitra. 2025. DAOP: Data-Aware Offloading and Predictive Pre-Calculation for Efficient MoE Inference. In 2025 Design, Automation & Test in Europe Conference (DATE). IEEE, 1-7."},{"key":"e_1_3_2_1_65_1","unstructured":"Wayne Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong et al. 2023. A survey of large language models. arXiv preprint arXiv:2303.18223 (2023)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3676536.3676741"}],"event":{"name":"ASPLOS '26: 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"deposited":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T14:05:51Z","timestamp":1773583551000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3779212.3790187"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":66,"alternative-id":["10.1145\/3779212.3790187","10.1145\/3779212"],"URL":"https:\/\/doi.org\/10.1145\/3779212.3790187","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}