{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:57:21Z","timestamp":1776887841898,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":106,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["No.2022ZD0119103"],"award-info":[{"award-number":["No.2022ZD0119103"]}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62325201"],"award-info":[{"award-number":["62325201"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707239","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"445-462","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["Fast On-device LLM Inference with NPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6775-0688","authenticated-orcid":false,"given":"Daliang","family":"Xu","sequence":"first","affiliation":[{"name":"Key Lab of HCST (PKU), MOE; SCS, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1107-4688","authenticated-orcid":false,"given":"Hao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1163-028X","authenticated-orcid":false,"given":"Liming","family":"Yang","sequence":"additional","affiliation":[{"name":"Key Lab of HCST (PKU), MOE; SCS, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3188-0976","authenticated-orcid":false,"given":"Ruiqi","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Lab of HCST (PKU), MOE; SCS, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4686-3181","authenticated-orcid":false,"given":"Gang","family":"Huang","sequence":"additional","affiliation":[{"name":"Key Lab of HCST (PKU), MOE; SCS, Peking University, Beijing, China, &amp; National Key Laboratory of Data Space Technology and System, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6271-6993","authenticated-orcid":false,"given":"Mengwei","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7908-8484","authenticated-orcid":false,"given":"Xuanzhe","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Lab of HCST (PKU), MOE; SCS, Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2019. WinoGrande: An Adversarial Winograd Schema Challenge at Scale."},{"key":"e_1_3_2_1_2_1","unstructured":"2021. General data protection regulation. https:\/\/gdpr-info.eu\/."},{"key":"e_1_3_2_1_3_1","unstructured":"2023. AMD Strix Point (Ryzen 300). https:\/\/www.sellcell.com\/blog\/how-often-do-people-upgrade-their-phone-2023-statistics. https:\/\/www.anandtech.com\/show\/21469\/amd-details-ryzen-ai-300-series-for-mobile-strix-point-with-rdna-35-igpu-xdna-2-npu\/2"},{"key":"e_1_3_2_1_4_1","unstructured":"2023. Ascend NPU. https:\/\/www.hisilicon.com\/en\/products\/Kirin\/Kirin-flagship-chips\/Kirin-9000. https:\/\/www.hisilicon.com\/en\/products\/Kirin\/Kirin-flagship-chips\/Kirin-9000"},{"key":"e_1_3_2_1_5_1","unstructured":"2023. Edge TPU API. https:\/\/coral.ai\/docs\/edgetpu\/inference\/#general-purpose-operating-systems"},{"key":"e_1_3_2_1_6_1","unstructured":"2023. FasterTransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer."},{"key":"e_1_3_2_1_7_1","unstructured":"2023. Gemma-2B. https:\/\/huggingface.co\/google\/gemma-2b. https:\/\/huggingface.co\/google\/gemma-2b"},{"key":"e_1_3_2_1_8_1","unstructured":"2023. HiAI Engine. https:\/\/developer.huawei.com\/consumer\/cn\/doc\/hiai-References\/overview-0000001053824513. https:\/\/developer.huawei.com\/consumer\/cn\/doc\/hiai-References\/overview-0000001053824513"},{"key":"e_1_3_2_1_9_1","unstructured":"2023. Llama-2-7b. https:\/\/huggingface.co\/meta-llama\/Llama-2-7bchat-hf. https:\/\/huggingface.co\/meta-llama\/Llama-2-7b-chat-hf"},{"key":"e_1_3_2_1_10_1","unstructured":"2023. MediaTek APU 790. https:\/\/corp.mediatek.com\/news-events\/press-releases\/mediateks-new-all-big-core-design-for-flagship-dimensity-9300-chipset-maximizes-smartphone-performance-and-efficiency. https:\/\/corp.mediatek.com\/news-events\/press-releases\/mediateks-new-all-big-core-design-for-flagship-dimensity-9300-chipset-maximizes-smartphone-performance-and-efficiency"},{"key":"e_1_3_2_1_11_1","unstructured":"2023. Meteor Lake. https:\/\/www.sellcell.com\/blog\/how-often-do-people-upgrade-their-phone-2023-statistics. https:\/\/en.wikipedia.org\/wiki\/Meteor_Lake"},{"key":"e_1_3_2_1_12_1","unstructured":"2023. Mistral-7B. https:\/\/huggingface.co\/mistralai\/Mistral-7B-Instruct-v0.3. https:\/\/huggingface.co\/mistralai\/Mistral-7B-Instruct-v0.3"},{"key":"e_1_3_2_1_13_1","unstructured":"2023. Neuro Pilot. https:\/\/neuropilot.mediatek.com\/. https:\/\/neuropilot.mediatek.com\/"},{"key":"e_1_3_2_1_14_1","unstructured":"2023. Phi-2. https:\/\/https:\/\/huggingface.co\/microsoft\/phi-2. https:\/\/https:\/\/huggingface.co\/microsoft\/phi-2"},{"key":"e_1_3_2_1_15_1","unstructured":"2023. Snapdragon 8gen3 SoC. https:\/\/www.qualcomm.com\/products\/mobile\/snapdragon\/smartphones\/snapdragon-8-series-mobile-platforms\/snapdragon-8-gen-3-mobile-platform."},{"key":"e_1_3_2_1_16_1","unstructured":"2024. AI Benchmark. https:\/\/ai-benchmark.com\/ranking_detailed.html."},{"key":"e_1_3_2_1_17_1","unstructured":"2024. AI Core. https:\/\/developer.android.com\/ai\/aicore."},{"key":"e_1_3_2_1_18_1","unstructured":"2024. Apple Intelligence. https:\/\/www.apple.com\/apple-intelligence\/."},{"key":"e_1_3_2_1_19_1","unstructured":"2024. EdgeTPU. https:\/\/cloud.google.com\/edge-tpu."},{"key":"e_1_3_2_1_20_1","unstructured":"2024. GPT-based email writer. https:\/\/hix.ai\/ai-email-writer-email-generator."},{"key":"e_1_3_2_1_21_1","unstructured":"2024. Hugging Face. https:\/\/huggingface.co\/."},{"key":"e_1_3_2_1_22_1","unstructured":"2024. LlamaTouch. https:\/\/github.com\/LlamaTouch\/LlamaTouch."},{"key":"e_1_3_2_1_23_1","unstructured":"2024. MLLM. https:\/\/github.com\/UbiquitousLearning\/mllm."},{"key":"e_1_3_2_1_24_1","unstructured":"2024. MMLU leader board. https:\/\/paperswithcode.com\/sota\/multi-task-language-understanding-on-mmlu."},{"key":"e_1_3_2_1_25_1","unstructured":"2024. QNN. https:\/\/www.qualcomm.com\/developer\/software\/qualcomm-ai-engine-direct-sdk."},{"key":"e_1_3_2_1_26_1","volume-title":"Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al. 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_1_27_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang ShengguangWu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023. Qwen Technical Report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Longbench: A bilingual, multitask benchmark for long context understanding. arXiv preprint arXiv:2308.14508","author":"Bai Yushi","year":"2023","unstructured":"Yushi Bai, Xin Lv, Jiajie Zhang, Hongchang Lyu, Jiankai Tang, Zhidian Huang, Zhengxiao Du, Xiao Liu, Aohan Zeng, Lei Hou, et al. 2023. Longbench: A bilingual, multitask benchmark for long context understanding. arXiv preprint arXiv:2308.14508 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Token merging: Your vit but faster. arXiv preprint arXiv:2210.09461","author":"Bolya Daniel","year":"2022","unstructured":"Daniel Bolya, Cheng-Yang Fu, Xiaoliang Dai, Peizhao Zhang, Christoph Feichtenhofer, and Judy Hoffman. 2022. Token merging: Your vit but faster. arXiv preprint arXiv:2210.09461 (2022)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3592505"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3486618"},{"key":"e_1_3_2_1_32_1","volume-title":"Medusa: Simple llm inference acceleration framework with multiple decoding heads. arXiv preprint arXiv:2401.10774","author":"Cai Tianle","year":"2024","unstructured":"Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, Jason D Lee, Deming Chen, and Tri Dao. 2024. Medusa: Simple llm inference acceleration framework with multiple decoding heads. arXiv preprint arXiv:2401.10774 (2024)."},{"key":"e_1_3_2_1_33_1","first-page":"30318","article-title":"Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale","volume":"35","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale. Advances in Neural Information Processing Systems 35 (2022), 30318--30332.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-78791-4_8"},{"key":"e_1_3_2_1_35_1","volume-title":"Gptq: Accurate post-training quantization for generative pretrained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pretrained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_36_1","volume-title":"Break the sequential dependency of llm inference using lookahead decoding. arXiv preprint arXiv:2402.02057","author":"Fu Yichao","year":"2024","unstructured":"Yichao Fu, Peter Bailis, Ion Stoica, and Hao Zhang. 2024. Break the sequential dependency of llm inference using lookahead decoding. arXiv preprint arXiv:2402.02057 (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2668332.2668349"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3418297"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.3390\/s21072364"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643832.3661878"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2019.00021"},{"key":"e_1_3_2_1_42_1","volume-title":"REST: Retrieval-Based Speculative Decoding. arXiv preprint arXiv:2311.08252","author":"He Zhenyu","year":"2023","unstructured":"Zhenyu He, Zexuan Zhong, Tianle Cai, Jason D Lee, and Di He. 2023. REST: Retrieval-Based Speculative Decoding. arXiv preprint arXiv:2311.08252 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2021. Measuring Massive Multitask Language Understanding. Proceedings of the International Conference on Learning Representations (ICLR) (2021)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Karla L Hoffman Manfred Padberg Giovanni Rinaldi et al. 2013. Traveling salesman problem. Encyclopedia of operations research and management science 1 (2013) 1573--1578.","DOI":"10.1007\/978-1-4419-1153-7_1068"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614285"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3081333.3081360"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00286"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Pegah Jandaghi XiangHai Sheng Xinyi Bai Jay Pujara and Hakim Sidahmed. 2023. Faithful Persona-based Conversational Dataset Generation with Large Language Models. arXiv:2312.10007 [cs.CL]","DOI":"10.18653\/v1\/2024.findings-acl.904"},{"key":"e_1_3_2_1_49_1","volume-title":"MNN: A universal and efficient inference engine. arXiv preprint arXiv:2002.12418","author":"Jiang Xiaotang","year":"2020","unstructured":"Xiaotang Jiang, Huan Wang, Yiliu Chen, Ziqi Wu, Lichuan Wang, Bin Zou, Yafeng Yang, Zongyang Cui, Yu Cai, Tianhang Yu, et al. 2020. MNN: A universal and efficient inference engine. arXiv preprint arXiv:2002.12418 (2020)."},{"key":"e_1_3_2_1_50_1","volume-title":"Big little transformer decoder. arXiv preprint arXiv:2302.07863","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Karttikeya Mangalam, Jitendra Malik, Michael W Mahoney, Amir Gholami, and Kurt Keutzer. 2023. Big little transformer decoder. arXiv preprint arXiv:2302.07863 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539260"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303950"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPSN.2016.7460664"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3300061.3345455"},{"key":"e_1_3_2_1_55_1","unstructured":"Yuanchun Li Hao Wen Weijun Wang Xiangyu Li Yizhen Yuan Guohong Liu Jiacheng Liu Wenxing Xu Xiang Wang Yi Sun et al. 2024. Personal llm agents: Insights and survey about the capability efficiency and security. arXiv preprint arXiv:2401.05459 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. 2023. Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978 (2023)."},{"key":"e_1_3_2_1_57_1","unstructured":"TensorFlow Lite. 2019. Deploy machine learning models on mobile and IoT devices."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/S11432-021-3528-7"},{"key":"e_1_3_2_1_59_1","volume-title":"Year of publication. Port of Facebook's LLaMA model in C\/C++ Resources. https:\/\/github.com\/ggerganov\/llama.cpp. Accessed","year":"2023","unstructured":"llama.cpp. Year of publication. Port of Facebook's LLaMA model in C\/C++ Resources. https:\/\/github.com\/ggerganov\/llama.cpp. Accessed: (2023.7)."},{"key":"e_1_3_2_1_60_1","volume-title":"Small Language Models: Survey, Measurements, and Insights. arXiv preprint arXiv:2409.15790","author":"Lu Zhenyan","year":"2024","unstructured":"Zhenyan Lu, Xiang Li, Dongqi Cai, Rongjie Yi, Fangming Liu, Xiwen Zhang, Nicholas D Lane, and Mengwei Xu. 2024. Small Language Models: Survey, Measurements, and Insights. arXiv preprint arXiv:2409.15790 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia.","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. 2023. SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification. arXiv preprint arXiv:2305.09781 (2023)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Todor Mihaylov Peter Clark Tushar Khot and Ashish Sabharwal. 2018. Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering. In EMNLP.","DOI":"10.18653\/v1\/D18-1260"},{"key":"e_1_3_2_1_63_1","volume-title":"SoD2: Statically Optimizing Dynamic Deep Neural Network. arXiv preprint arXiv:2403.00176","author":"Niu Wei","year":"2024","unstructured":"Wei Niu, Gagan Agrawal, and Bin Ren. 2024. SoD2: Statically Optimizing Dynamic Deep Neural Network. arXiv preprint arXiv:2403.00176 (2024)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00044"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378534"},{"key":"e_1_3_2_1_66_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"3","author":"Niu Wei","year":"2024","unstructured":"Wei Niu, Md Musfiqur Rahman Sanim, Zhihao Shu, Jiexiong Guan, Xipeng Shen, Miao Yin, Gagan Agrawal, and Bin Ren. 2024. Smart-Mem: Layout Transformation Elimination and Adaptation for Efficient DNN Execution on Mobile. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3. 916--931."},{"key":"e_1_3_2_1_67_1","volume-title":"Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139","author":"Olston Christopher","year":"2017","unstructured":"Christopher Olston, Noah Fiedel, Kiril Gorovoy, Jeremiah Harmsen, Li Lao, Fangwei Li, Vinu Rajashekhar, Sukriti Ramesh, and Jordan Soyke. 2017. Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139 (2017)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1144"},{"key":"e_1_3_2_1_69_1","volume-title":"Splitwise: Efficient generative llm inference using phase splitting. arXiv preprint arXiv:2311.18677","author":"Patel Pratyush","year":"2023","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, \u00cd\u00f1igo Goiri, Aashaka Shah, Saeed Maleki, and Ricardo Bianchini. 2023. Splitwise: Efficient generative llm inference using phase splitting. arXiv preprint arXiv:2311.18677 (2023)."},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_2_1_72_1","volume-title":"Dynamicvit: Efficient vision transformers with dynamic token sparsification. Advances in neural information processing systems 34","author":"Rao Yongming","year":"2021","unstructured":"Yongming Rao, Wenliang Zhao, Benlin Liu, Jiwen Lu, Jie Zhou, and Cho-Jui Hsieh. 2021. Dynamicvit: Efficient vision transformers with dynamic token sparsification. Advances in neural information processing systems 34 (2021), 13937--13949."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623278.3624768"},{"key":"e_1_3_2_1_74_1","volume-title":"Powerinfer: Fast large language model serving with a consumer-grade gpu. arXiv preprint arXiv:2312.12456","author":"Song Yixin","year":"2023","unstructured":"Yixin Song, Zeyu Mi, Haotong Xie, and Haibo Chen. 2023. Powerinfer: Fast large language model serving with a consumer-grade gpu. arXiv preprint arXiv:2312.12456 (2023)."},{"key":"e_1_3_2_1_75_1","unstructured":"MLC team. 2023. MLC-LLM. https:\/\/github.com\/mlc-ai\/mlc-llm"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"crossref","unstructured":"Samuel Thomas and James Bornholt. 2024. Automatic Generation of Vectorizing Compilers for Customizable Digital Signal Processors. (2024).","DOI":"10.1145\/3617232.3624873"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3106343"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_2_1_80_1","first-page":"5776","article-title":"Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","volume":"33","author":"Wang Wenhui","year":"2020","unstructured":"Wenhui Wang, Furu Wei, Li Dong, Hangbo Bao, Nan Yang, and Ming Zhou. 2020. Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. Advances in Neural Information Processing Systems 33 (2020), 5776--5788.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_81_1","volume-title":"Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu.","author":"Wen Hao","year":"2023","unstructured":"Hao Wen, Yuanchun Li, Guohong Liu, Shanhui Zhao, Tao Yu, Toby Jia-Jun Li, Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu. 2023. Empowering llm to use smartphone for intelligent task automation. arXiv preprint arXiv:2308.15272 (2023)."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_2_1_83_1","volume-title":"LoongServe: Efficiently Serving Long-context Large Language Models with Elastic Sequence Parallelism. arXiv preprint arXiv:2404.09526","author":"Wu Bingyang","year":"2024","unstructured":"Bingyang Wu, Shengyu Liu, Yinmin Zhong, Peng Sun, Xuanzhe Liu, and Xin Jin. 2024. LoongServe: Efficiently Serving Long-context Large Language Models with Elastic Sequence Parallelism. arXiv preprint arXiv:2404.09526 (2024)."},{"key":"e_1_3_2_1_84_1","volume-title":"International Conference on Machine Learning. PMLR, 38087--38099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087--38099."},{"key":"e_1_3_2_1_85_1","volume-title":"Niagara: Scheduling DNN Inference Services on Heterogeneous Edge Processors. In International Conference on Service-Oriented Computing. Springer, 67--85","author":"Xu Daliang","year":"2023","unstructured":"Daliang Xu, Qing Li, Mengwei Xu, Kang Huang, Gang Huang, Shangguang Wang, Xin Jin, Yun Ma, and Xuanzhe Liu. 2023. Niagara: Scheduling DNN Inference Services on Heterogeneous Edge Processors. In International Conference on Service-Oriented Computing. Springer, 67--85."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3495243.3560545"},{"key":"e_1_3_2_1_87_1","volume-title":"Llmcad: Fast and scalable on-device large language model inference. arXiv preprint arXiv:2309.04255","author":"Xu Daliang","year":"2023","unstructured":"Daliang Xu, Wangsong Yin, Xin Jin, Ying Zhang, Shiyun Wei, Mengwei Xu, and Xuanzhe Liu. 2023. Llmcad: Fast and scalable on-device large language model inference. arXiv preprint arXiv:2309.04255 (2023)."},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/3662006.3662066"},{"key":"e_1_3_2_1_89_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Xu Mengwei","year":"2024","unstructured":"Mengwei Xu, Dongqi Cai, Yaozong Wu, Xiang Li, and Shangguang Wang. 2024. {FwdLLM}: Efficient Federated Finetuning of Large Language Models with Perturbed Inferences. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 579--596."},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313591"},{"key":"e_1_3_2_1_91_1","unstructured":"Mengwei Xu Wangsong Yin Dongqi Cai Rongjie Yi Daliang Xu Qipeng Wang Bingyang Wu Yihao Zhao Chen Yang Shihe Wang et al. 2024. A survey of resource-efficient llm and multimodal foundation models. arXiv preprint arXiv:2401.08092 (2024)."},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1145\/3241539.3241563"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4227-2"},{"key":"e_1_3_2_1_94_1","volume-title":"PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282","author":"Xue Zhenliang","year":"2024","unstructured":"Zhenliang Xue, Yixin Song, Zeyu Mi, Le Chen, Yubin Xia, and Haibo Chen. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282 (2024)."},{"key":"e_1_3_2_1_95_1","volume-title":"Predictive Pipelined Decoding: A Compute-Latency Trade-off for Exact LLM Decoding. arXiv preprint arXiv:2307.05908","author":"Yang Seongjun","year":"2023","unstructured":"Seongjun Yang, Gibbeum Lee, Jaewoong Cho, Dimitris Papailiopoulos, and Kangwook Lee. 2023. Predictive Pipelined Decoding: A Compute-Latency Trade-off for Exact LLM Decoding. arXiv preprint arXiv:2307.05908 (2023)."},{"key":"e_1_3_2_1_96_1","first-page":"27168","article-title":"Zeroquant: Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. 2022. Zeroquant: Efficient and affordable post-training quantization for large-scale transformers. Advances in Neural Information Processing Systems 35 (2022), 27168--27183.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372224.3419192"},{"key":"e_1_3_2_1_98_1","volume-title":"Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352","author":"Yi Rongjie","year":"2023","unstructured":"Rongjie Yi, Liwei Guo, ShiyunWei, Ao Zhou, ShangguangWang, and Mengwei Xu. 2023. Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352 (2023)."},{"key":"e_1_3_2_1_99_1","volume-title":"Llm as a system service on mobile devices. arXiv preprint arXiv:2403.11805","author":"Yin Wangsong","year":"2024","unstructured":"Wangsong Yin, Mengwei Xu, Yuanchun Li, and Xuanzhe Liu. 2024. Llm as a system service on mobile devices. arXiv preprint arXiv:2403.11805 (2024)."},{"key":"e_1_3_2_1_100_1","volume-title":"ELMS: Elasticized Large Language Models On Mobile Devices. arXiv preprint arXiv:2409.09071","author":"Yin Wangsong","year":"2024","unstructured":"Wangsong Yin, Rongjie Yi, Daliang Xu, Gang Huang, Mengwei Xu, and Xuanzhe Liu. 2024. ELMS: Elasticized Large Language Models On Mobile Devices. arXiv preprint arXiv:2409.09071 (2024)."},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747065"},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649361"},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"e_1_3_2_1_104_1","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2021.3088910"},{"key":"e_1_3_2_1_105_1","doi-asserted-by":"publisher","DOI":"10.1145\/3384419.3430726"},{"key":"e_1_3_2_1_106_1","volume-title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. arXiv preprint arXiv:2401.09670","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. arXiv preprint arXiv:2401.09670 (2024)."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707239","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707239","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:49:25Z","timestamp":1755787765000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707239"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":106,"alternative-id":["10.1145\/3669940.3707239","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707239","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}