{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T19:08:40Z","timestamp":1776107320354,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":93,"publisher":"ACM","funder":[{"name":"The National Natural Science Foundation of China under Grant","award":["6232520"],"award-info":[{"award-number":["6232520"]}]},{"name":"The Shenzhen Science and Technology Program","award":["JCYJ20241202124021028"],"award-info":[{"award-number":["JCYJ20241202124021028"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,3]]},"DOI":"10.1145\/3680207.3765259","type":"proceedings-article","created":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T13:20:32Z","timestamp":1763731232000},"page":"984-999","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Elastic On-Device LLM Service"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6242-4368","authenticated-orcid":false,"given":"Wangsong","family":"Yin","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6040-9596","authenticated-orcid":false,"given":"Rongjie","family":"Yi","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6775-0688","authenticated-orcid":false,"given":"Daliang","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4686-3181","authenticated-orcid":false,"given":"Gang","family":"Huang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6271-6993","authenticated-orcid":false,"given":"Mengwei","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, and Beiyou Shenzhen Institute, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7908-8484","authenticated-orcid":false,"given":"Xuanzhe","family":"Liu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,21]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. AICore. https:\/\/developer.android.com\/ml\/aicore."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. alpaca cleaned. https:\/\/huggingface.co\/datasets\/yahma\/alpacacleaned."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. Apple Intelligence. https:\/\/www.apple.com\/apple-intelligence\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. ARM NEON. https:\/\/developer.arm.com\/Architectures\/Neon."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. Gboard Smart Reply. https:\/\/developers.google.com\/ml-kit\/language\/smart-reply."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. Hey Siri: An On-device DNN-powered Voice Trigger for Apple's Personal Assistant. https:\/\/machinelearning.apple.com\/research\/heysiri."},{"key":"e_1_3_2_1_7_1","unstructured":"2024. Huggingface GPU pricing. https:\/\/huggingface.co\/pricing."},{"key":"e_1_3_2_1_8_1","unstructured":"2024. Llama.cpp. https:\/\/github.com\/ggerganov\/llama.cpp."},{"key":"e_1_3_2_1_9_1","unstructured":"2024. LLM layer pruning. https:\/\/github.com\/horseee\/LLM-Pruner\/blob\/cbe488944ed772f342e99d3d0efbab9df6520c21\/hf_prune.py#L219."},{"key":"e_1_3_2_1_10_1","volume-title":"MeetingBank compressed. https:\/\/huggingface.co\/datasets\/microsoft\/MeetingBank-LLMCompressed.","unstructured":"2024. MeetingBank compressed. https:\/\/huggingface.co\/datasets\/microsoft\/MeetingBank-LLMCompressed."},{"key":"e_1_3_2_1_11_1","unstructured":"2024. mllm. https:\/\/github.com\/UbiquitousLearning\/mllm."},{"key":"e_1_3_2_1_12_1","unstructured":"2024. redmi-k60-champion-edition. https:\/\/www.giztop.com\/redmi-k60-champion-edition.html."},{"key":"e_1_3_2_1_13_1","unstructured":"2024. Redmi K70 Pro. https:\/\/www.mi.com\/redmi-k70-pro."},{"key":"e_1_3_2_1_14_1","unstructured":"2024. rewind. https:\/\/www.rewind.ai\/."},{"key":"e_1_3_2_1_15_1","unstructured":"2024. XiaoAi smart assistant. https:\/\/xiaoai.mi.com\/."},{"key":"e_1_3_2_1_16_1","unstructured":"2024. xiaomi-14. https:\/\/www.mi.com\/global\/product\/xiaomi-14\/."},{"key":"e_1_3_2_1_17_1","unstructured":"OpenAI: Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya et al. 2023. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL]"},{"key":"e_1_3_2_1_18_1","unstructured":"Abien Fred Agarap. 2019. Deep Learning using Rectified Linear Units (ReLU). arXiv:1803.08375 [cs.NE] https:\/\/arxiv.org\/abs\/1803.08375"},{"key":"e_1_3_2_1_19_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan et al. 2023. Qwen Technical Report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Han Cai Chuang Gan Tianzhe Wang Zhekai Zhang and Song Han. 2020. Once-for-All: Train One Network and Specialize it for Efficient Deployment. arXiv:1908.09791 [cs.LG] https:\/\/arxiv.org\/abs\/1908.09791","DOI":"10.1145\/3366423.3380259"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01147"},{"key":"e_1_3_2_1_22_1","unstructured":"Wei Chen and Zhiyuan Li. 2024. Octopus v2: On-device language model for super agent. arXiv:2404.01744 [cs.CL]"},{"key":"e_1_3_2_1_23_1","volume-title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge. arXiv:1803.05457v1","author":"Clark Peter","year":"2018","unstructured":"Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. 2018. Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge. arXiv:1803.05457v1 (2018)."},{"key":"e_1_3_2_1_24_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Qingxiu Dong Lei Li Damai Dai Ce Zheng Jingyuan Ma Rui Li Heming Xia Jingjing Xu Zhiyong Wu Baobao Chang Xu Sun Lei Li and Zhifang Sui. 2024. A Survey on In-context Learning. arXiv:2301.00234 [cs.CL] https:\/\/arxiv.org\/abs\/2301.00234","DOI":"10.18653\/v1\/2024.emnlp-main.64"},{"key":"e_1_3_2_1_26_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3241539.3241559"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16091\u201316101","author":"Fang Gongfan","year":"2023","unstructured":"Gongfan Fang, Xinyin Ma, Mingli Song, Michael Bi Mi, and Xinchao Wang. 2023. Depgraph: Towards any structural pruning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16091\u201316101."},{"key":"e_1_3_2_1_29_1","unstructured":"Yingqiang Ge Yujie Ren Wenyue Hua Shuyuan Xu Juntao Tan and Yongfeng Zhang. 2023. LLM as OS Agents as Apps: Envisioning AIOS Agents and the AIOS-Agent Ecosystem. arXiv:2312.03815 [cs.OS] https:\/\/arxiv.org\/abs\/2312.03815"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447993.3483249"},{"key":"e_1_3_2_1_31_1","volume-title":"Dally","author":"Han Song","year":"2015","unstructured":"Song Han, Jeff Pool, John Tran, and William J. Dally. 2015. Learning both Weights and Connections for Efficient Neural Networks. arXiv:1506.02626 [cs.NE] https:\/\/arxiv.org\/abs\/1506.02626"},{"key":"e_1_3_2_1_32_1","unstructured":"Shwai He Guoheng Sun Zheyu Shen and Ang Li. 2024. What Matters in Transformers? Not All Attention is Needed. arXiv:2406.15786 [cs.LG] https:\/\/arxiv.org\/abs\/2406.15786"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2021. Measuring Massive Multitask Language Understanding. Proceedings of the International Conference on Learning Representations (ICLR) (2021)."},{"key":"e_1_3_2_1_34_1","unstructured":"Edward J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models. arXiv:2106.09685 [cs.CL] https:\/\/arxiv.org\/abs\/2106.09685"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581791.3596852"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevA.39.6600"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Huiqiang Jiang Qianhui Wu Chin-Yew Lin Yuqing Yang and Lili Qiu. 2023. LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models. arXiv:2310.05736 [cs.CL] https:\/\/arxiv.org\/abs\/2310.05736","DOI":"10.18653\/v1\/2023.emnlp-main.825"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.91"},{"key":"e_1_3_2_1_39_1","volume-title":"MNN: A Universal and Efficient Inference Engine. In MLSys.","author":"Jiang Xiaotang","year":"2020","unstructured":"Xiaotang Jiang, Huan Wang, Yiliu Chen, Ziqi Wu, Lichuan Wang, Bin Zou, Yafeng Yang, Zongyang Cui, Yu Cai, Tianhang Yu, Chengfei Lv, and Zhihua Wu. 2020. MNN: A Universal and Efficient Inference Engine. In MLSys."},{"key":"e_1_3_2_1_40_1","unstructured":"Matt Gardner Johannes Welbl Nelson F. Liu. 2017. Crowdsourcing Multiple Choice Science Questions. arXiv:1707.06209v1."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581791.3596831"},{"key":"e_1_3_2_1_42_1","volume-title":"Joseph E. Gonzalez, Hao Zhang, and Ion Stoica.","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient Memory Management for Large Language Model Serving with PagedAttention. arXiv:2309.06180 [cs.LG] https:\/\/arxiv.org\/abs\/2309.06180"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447993.3483278"},{"key":"e_1_3_2_1_44_1","unstructured":"Hao Li Asim Kadav Igor Durdanovic Hanan Samet and Hans Peter Graf. 2017. Pruning Filters for Efficient ConvNets. arXiv:1608.08710 [cs.CV] https:\/\/arxiv.org\/abs\/1608.08710"},{"key":"e_1_3_2_1_45_1","volume-title":"Personal LLM Agents: Insights and Survey about the Capability, Efficiency and Security. arXiv preprint arXiv:2401.05459","author":"Li Yuanchun","year":"2024","unstructured":"Yuanchun Li, Hao Wen, Weijun Wang, Xiangyu Li, Yizhen Yuan, Guohong Liu, Jiacheng Liu, Wenxing Xu, Xiang Wang, Yi Sun, Rui Kong, Yile Wang, Hanfei Geng, Jian Luan, Xuefeng Jin, Zilong Ye, Guanjing Xiong, Fan Zhang, Xiang Li, Mengwei Xu, Zhijun Li, Peng Li, Yang Liu, Ya-Qin Zhang, and Yunxin Liu. 2024. Personal LLM Agents: Insights and Survey about the Capability, Efficiency and Security. arXiv preprint arXiv:2401.05459 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"22176","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, and Beidi Chen. 2023. Deja Vu: Contextual Sparsity for Efficient LLMs at Inference Time. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 22137\u201322176. https:\/\/proceedings.mlr.press\/v202\/liu23am.html"},{"key":"e_1_3_2_1_47_1","unstructured":"Xinyin Ma Gongfan Fang and Xinchao Wang. 2023. LLM-Pruner: On the Structural Pruning of Large Language Models. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_48_1","volume-title":"AIOS: LLM Agent Operating System. arXiv:2403.16971 [cs.OS] https:\/\/arxiv.org\/abs\/2403.16971","author":"Mei Kai","year":"2024","unstructured":"Kai Mei, Zelong Li, Shuyuan Xu, Ruosong Ye, Yingqiang Ge, and Yongfeng Zhang. 2024. AIOS: LLM Agent Operating System. arXiv:2403.16971 [cs.OS] https:\/\/arxiv.org\/abs\/2403.16971"},{"key":"e_1_3_2_1_49_1","unstructured":"Xin Men Mingyu Xu Qingyu Zhang Bingning Wang Hongyu Lin Yaojie Lu Xianpei Han and Weipeng Chen. 2024. ShortGPT: Layers in Large Language Models are More Redundant Than You Expect. arXiv:2403.03853 [cs.CL] https:\/\/arxiv.org\/abs\/2403.03853"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Todor Mihaylov Peter Clark Tushar Khot and Ashish Sabharwal. 2018. Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering. In EMNLP.","DOI":"10.18653\/v1\/D18-1260"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.1"},{"key":"e_1_3_2_1_53_1","volume-title":"Orca: Progressive Learning from Complex Explanation Traces of GPT-4. arXiv:2306.02707 [cs.CL]","author":"Mukherjee Subhabrata","year":"2023","unstructured":"Subhabrata Mukherjee, Arindam Mitra, Ganesh Jawahar, Sahaj Agarwal, Hamid Palangi, and Ahmed Awadallah. 2023. Orca: Progressive Learning from Complex Explanation Traces of GPT-4. arXiv:2306.02707 [cs.CL]"},{"key":"e_1_3_2_1_54_1","unstructured":"Long Ouyang Jeff Wu Xu Jiang Diogo Almeida Carroll L. Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray John Schulman Jacob Hilton Fraser Kelton Luke Miller Maddie Simens Amanda Askell Peter Welinder Paul Christiano Jan Leike and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. arXiv:2203.02155 [cs.CL] https:\/\/arxiv.org\/abs\/2203.02155"},{"key":"e_1_3_2_1_55_1","volume-title":"Gonzalez","author":"Packer Charles","year":"2024","unstructured":"Charles Packer, Sarah Wooders, Kevin Lin, Vivian Fang, Shishir G. Patil, Ion Stoica, and Joseph E. Gonzalez. 2024. MemGPT: Towards LLMs as Operating Systems. arXiv:2310.08560 [cs.AI] https:\/\/arxiv.org\/abs\/2310.08560"},{"key":"e_1_3_2_1_56_1","volume-title":"LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression. ArXiv preprint abs\/2403.12968","author":"Pan Zhuoshi","year":"2024","unstructured":"Zhuoshi Pan, Qianhui Wu, Huiqiang Jiang, Menglin Xia, Xufang Luo, Jue Zhang, Qingwei Lin, Victor Ruhle, Yuqing Yang, Chin-Yew Lin, H. Vicky Zhao, Lili Qiu, and Dongmei Zhang. 2024. LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression. ArXiv preprint abs\/2403.12968 (2024). https:\/\/arxiv.org\/abs\/2403.12968"},{"key":"e_1_3_2_1_57_1","unstructured":"Adam Paszke Sam Gross Francisco Massa et al. 2019. PyTorch: An Imperative Style High-Performance Deep Learning Library. arXiv:1912.01703 [cs.LG] https:\/\/arxiv.org\/abs\/1912.01703"},{"key":"e_1_3_2_1_58_1","volume-title":"Liu","author":"Raffel Colin","year":"2023","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2023. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683 [cs.LG] https:\/\/arxiv.org\/abs\/1910.10683"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"Yixin Song Zeyu Mi Haotong Xie and Haibo Chen. 2023. PowerInfer: Fast Large Language Model Serving with a Consumer-grade GPU. arXiv:2312.12456 [cs.LG]","DOI":"10.1145\/3694715.3695964"},{"key":"e_1_3_2_1_61_1","unstructured":"Jianlin Su Yu Lu Shengfeng Pan Ahmed Murtadha Bo Wen and Yunfeng Liu. 2023. RoFormer: Enhanced Transformer with Rotary Position Embedding. arXiv:2104.09864 [cs.CL] https:\/\/arxiv.org\/abs\/2104.09864"},{"key":"e_1_3_2_1_62_1","unstructured":"Mingjie Sun Zhuang Liu Anna Bair and J. Zico Kolter. 2024. A Simple and Effective Pruning Approach for Large Language Models. arXiv:2306.11695 [cs.CL] https:\/\/arxiv.org\/abs\/2306.11695"},{"key":"e_1_3_2_1_63_1","unstructured":"Zhiqing Sun Hongkun Yu Xiaodan Song Renjie Liu Yiming Yang and Denny Zhou. 2020. MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices. arXiv:2004.02984 [cs.CL] https:\/\/arxiv.org\/abs\/2004.02984"},{"key":"e_1_3_2_1_64_1","unstructured":"Mukund Sundararajan Ankur Taly and Qiqi Yan. 2017. Axiomatic Attribution for Deep Networks. arXiv:1703.01365 [cs.LG] https:\/\/arxiv.org\/abs\/1703.01365"},{"key":"e_1_3_2_1_65_1","volume-title":"Calculus with analytic geometry","author":"Swokowski Earl William","unstructured":"Earl William Swokowski. 1979. Calculus with analytic geometry. Taylor & Francis."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Thierry Tambe Coleman Hooper Lillian Pentecost Tianyu Jia En-Yu Yang Marco Donato Victor Sanh Paul N. Whatmough Alexander M. Rush David Brooks and Gu-Yeon Wei. 2021. EdgeBERT: Sentence-Level Energy Optimizations for Latency-Aware Multi-Task NLP Inference. arXiv:2011.14203 [cs.AR] https:\/\/arxiv.org\/abs\/2011.14203","DOI":"10.1145\/3466752.3480095"},{"key":"e_1_3_2_1_67_1","volume-title":"Hashimoto","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_2_1_68_1","unstructured":"MLC team. 2023. MLC-LLM. https:\/\/github.com\/mlc-ai\/mlc-llm"},{"key":"e_1_3_2_1_69_1","unstructured":"Surat Teerapittayanon Bradley McDanel and H. T. Kung. 2017. BranchyNet: Fast Inference via Early Exiting from Deep Neural Networks. arXiv:1709.01686 [cs.NE] https:\/\/arxiv.org\/abs\/1709.01686"},{"key":"e_1_3_2_1_70_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv:2302.13971 [cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_2_1_71_1","volume-title":"Bowman","author":"Wang Alex","year":"2019","unstructured":"Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. 2019. GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. arXiv:1804.07461 [cs.CL] https:\/\/arxiv.org\/abs\/1804.07461"},{"key":"e_1_3_2_1_72_1","unstructured":"Yubo Wang Xueguang Ma et al. 2024. MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark. arXiv:2406.01574 [cs.CL] https:\/\/arxiv.org\/abs\/2406.01574"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","unstructured":"Hao Wen Yuanchun Li Zunshuai Zhang Shiqi Jiang Xiaozhou Ye Ye Ouyang Ya-Qin Zhang and Yunxin Liu. 2023. AdaptiveNet: Post-deployment Neural Architecture Adaptation for Diverse Edge Environments. arXiv:2303.07129 [cs.LG] https:\/\/arxiv.org\/abs\/2303.07129","DOI":"10.1145\/3570361.3592529"},{"key":"e_1_3_2_1_75_1","unstructured":"Noam Wies Yoav Levine and Amnon Shashua. 2023. The Learnability of In-Context Learning. arXiv:2303.07895 [cs.CL] https:\/\/arxiv.org\/abs\/2303.07895"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_77_1","volume-title":"LaMini-LM: A Diverse Herd of Distilled Models from Large-Scale Instructions. CoRR abs\/2304.14402","author":"Wu Minghao","year":"2023","unstructured":"Minghao Wu, Abdul Waheed, Chiyu Zhang, Muhammad Abdul-Mageed, and Alham Fikri Aji. 2023. LaMini-LM: A Diverse Herd of Distilled Models from Large-Scale Instructions. CoRR abs\/2304.14402 (2023). arXiv:2304.14402 https:\/\/arxiv.org\/abs\/2304.14402"},{"key":"e_1_3_2_1_78_1","unstructured":"Mengzhou Xia Tianyu Gao Zhiyuan Zeng and Danqi Chen. 2024. Sheared LLaMA: Accelerating Language Model Pre-training via Structured Pruning. arXiv:2310.06694 [cs.CL] https:\/\/arxiv.org\/abs\/2310.06694"},{"key":"e_1_3_2_1_79_1","volume-title":"DroidCall: A Dataset for LLM-powered Android Intent Invocation. arXiv preprint arXiv:2412.00402","author":"Xie Weikai","year":"2024","unstructured":"Weikai Xie, Li Zhang, Shihe Wang, Rongjie Yi, and Mengwei Xu. 2024. DroidCall: A Dataset for LLM-powered Android Intent Invocation. arXiv preprint arXiv:2412.00402 (2024)."},{"key":"e_1_3_2_1_80_1","unstructured":"Daliang Xu Wangsong Yin Xin Jin Ying Zhang Shiyun Wei Mengwei Xu and Xuanzhe Liu. 2023. LLMCad: Fast and Scalable On-device Large Language Model Inference. arXiv:2309.04255 [cs.NI] https:\/\/arxiv.org\/abs\/2309.04255"},{"key":"e_1_3_2_1_81_1","unstructured":"Daliang Xu Hao Zhang Liming Yang Ruiqi Liu Gang Huang Mengwei Xu and Xuanzhe Liu. 2024. Empowering 1000 tokens\/second on-device LLM prefilling with mllm-NPU. arXiv:2407.05858 [cs.AI] https:\/\/arxiv.org\/abs\/2407.05858"},{"key":"e_1_3_2_1_82_1","unstructured":"Zhenliang Xue Yixin Song Zeyu Mi Le Chen Yubin Xia and Haibo Chen. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv:2406.06282 [cs.LG] https:\/\/arxiv.org\/abs\/2406.06282"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"crossref","unstructured":"Sohee Yang Jonghyeon Kim Joel Jang Seonghyeon Ye Hyunji Lee and Minjoon Seo. 2024. Improving Probability-based Prompt Selection Through Unified Evaluation and Analysis. arXiv:2305.14877 [cs.CL] https:\/\/arxiv.org\/abs\/2305.14877","DOI":"10.1162\/tacl_a_00666"},{"key":"e_1_3_2_1_84_1","volume-title":"Laco: Large language model pruning via layer collapse. arXiv preprint arXiv:2402.11187","author":"Yang Yifei","year":"2024","unstructured":"Yifei Yang, Zouying Cao, and Hai Zhao. 2024. Laco: Large language model pruning via layer collapse. arXiv preprint arXiv:2402.11187 (2024)."},{"key":"e_1_3_2_1_85_1","unstructured":"Wangsong Yin Mengwei Xu Yuanchun Li and Xuanzhe Liu. 2024. LLM as a System Service on Mobile Devices. arXiv:2403.11805 [cs.OS] https:\/\/arxiv.org\/abs\/2403.11805"},{"key":"e_1_3_2_1_86_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521\u2013538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649361"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"crossref","unstructured":"Li Zhang Shihe Wang Xianqing Jia Zhihan Zheng Yunhe Yan Longxi Gao Yuanchun Li and Mengwei Xu. 2024. LlamaTouch: A Faithful and Scalable Testbed for Mobile UI Task Automation. arXiv:2404.16054 [cs.HC] https:\/\/arxiv.org\/abs\/2404.16054","DOI":"10.1145\/3654777.3676382"},{"key":"e_1_3_2_1_89_1","volume-title":"OPT: Open Pre-trained Transformer Language Models. arXiv:2205.01068 [cs.CL] https:\/\/arxiv.org\/abs\/2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, et al. 2022. OPT: Open Pre-trained Transformer Language Models. arXiv:2205.01068 [cs.CL] https:\/\/arxiv.org\/abs\/2205.01068"},{"key":"e_1_3_2_1_90_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric. P Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arXiv:2306.05685 [cs.CL]"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613139"},{"key":"e_1_3_2_1_92_1","unstructured":"Jeffrey Zhou Tianjian Lu Swaroop Mishra Siddhartha Brahma Sujoy Basu Yi Luan Denny Zhou and Le Hou. 2023. Instruction-Following Evaluation for Large Language Models. arXiv:2311.07911 [cs.CL] https:\/\/arxiv.org\/abs\/2311.07911"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"}],"event":{"name":"ACM MOBICOM '25: 31st Annual International Conference on Mobile Computing and Networking","location":"Kerry Hotel, Hong Kong Hong Kong China","acronym":"ACM MOBICOM '25","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing"]},"container-title":["Proceedings of the 31st Annual International Conference on Mobile Computing and Networking"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680207.3765259","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T13:22:05Z","timestamp":1763731325000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680207.3765259"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,3]]},"references-count":93,"alternative-id":["10.1145\/3680207.3765259","10.1145\/3680207"],"URL":"https:\/\/doi.org\/10.1145\/3680207.3765259","relation":{},"subject":[],"published":{"date-parts":[[2025,11,3]]},"assertion":[{"value":"2025-11-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}