{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:14:29Z","timestamp":1780060469796,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":86,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62325201"],"award-info":[{"award-number":["62325201"]}]},{"name":"NSFC","award":["62522202"],"award-info":[{"award-number":["62522202"]}]},{"name":"Beijing Natural Science Foundation","award":["L253005"],"award-info":[{"award-number":["L253005"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809205","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"294-308","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ShadowNPU: System and Algorithm Co-design for NPU-Centric On-Device LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6242-4368","authenticated-orcid":false,"given":"Wangsong","family":"Yin","sequence":"first","affiliation":[{"name":"Key Lab of HCST (PKU), MOE; SCS, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6775-0688","authenticated-orcid":false,"given":"Daliang","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6271-6993","authenticated-orcid":false,"given":"Mengwei","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4686-3181","authenticated-orcid":false,"given":"Gang","family":"Huang","sequence":"additional","affiliation":[{"name":"Key Lab of HCST (PKU), MOE; SCS, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7908-8484","authenticated-orcid":false,"given":"Xuanzhe","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Lab of HCST (PKU), MOE; SCS, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2025. ARM NEON. https:\/\/www.arm.com\/technologies\/neon."},{"key":"e_1_3_2_1_2_1","unstructured":"2025. Hexagon NPU SDK. https:\/\/www.qualcomm.com\/developer\/software\/hexagon-npu-sdk."},{"key":"e_1_3_2_1_3_1","unstructured":"2025. LLVM. https:\/\/llvm.org\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2025. Open CL. https:\/\/en.wikipedia.org\/wiki\/OpenCL."},{"key":"e_1_3_2_1_5_1","unstructured":"2025. QNN SDK. https:\/\/docs.qualcomm.com\/bundle\/publicresource\/topics\/80-63442-50\/introduction.html."},{"key":"e_1_3_2_1_6_1","unstructured":"2025. Qualcomm Neural Processing Engine. https:\/\/docs.qualcomm.com\/bundle\/publicresource\/topics\/80-70015-15BY\/snpe.html."},{"key":"e_1_3_2_1_7_1","unstructured":"2025. rewind. https:\/\/www.rewind.ai\/."},{"key":"e_1_3_2_1_8_1","unstructured":"2025. Snapdragon 8 gen 3 mobile platform product brief. https:\/\/docs.qualcomm.com\/bundle\/publicresource\/87-71408-1_REV_C_Snapdragon_8_gen_3_Mobile_Platform_Product_Brief.pdf."},{"key":"e_1_3_2_1_9_1","unstructured":"2025. TMS320F2812 platform product brief. https:\/\/www.ti.com\/product\/TMS320F2812."},{"key":"e_1_3_2_1_10_1","unstructured":"2026. ChnSentiCorp dataset. https:\/\/huggingface.co\/datasets\/lansinuote\/ChnSentiCorp."},{"key":"e_1_3_2_1_11_1","unstructured":"Marah Abdin and etc. Jyoti Aneja. 2024. Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone. arXiv:2404.14219 [cs.CL] https:\/\/arxiv.org\/abs\/2404.14219"},{"key":"e_1_3_2_1_12_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023. Qwen Technical Report. arXiv:2309.16609 [cs.CL] https:\/\/arxiv.org\/abs\/2309.16609"},{"key":"e_1_3_2_1_13_1","volume-title":"Paul Pu Liang, and Mani Srivastava","author":"Baris Ozan","year":"2025","unstructured":"Ozan Baris, Yizhuo Chen, Gaofeng Dong, Liying Han, Tomoyoshi Kimura, Pengrui Quan, Ruijie Wang, Tianchen Wang, Tarek Abdelzaher, Mario Berg\u00e9s, Paul Pu Liang, and Mani Srivastava. 2025. Foundation Models for CPS-IoT: Opportunities and Challenges. arXiv:2501.16368 [cs.LG] https:\/\/arxiv.org\/abs\/2501.16368"},{"key":"e_1_3_2_1_14_1","volume-title":"Yingyan Celine Lin, and Pavlo Molchanov","author":"Belcak Peter","year":"2025","unstructured":"Peter Belcak, Greg Heinrich, Shizhe Diao, Yonggan Fu, Xin Dong, Saurav Muralidharan, Yingyan Celine Lin, and Pavlo Molchanov. 2025. Small Language Models are the Future of Agentic AI. arXiv:2506.02153 [cs.AI] https:\/\/arxiv.org\/abs\/2506.02153"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/tkde.2025.3554028"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Le Chen Dahu Feng Erhu Feng Rong Zhao Yingrui Wang Yubin Xia Haibo Chen and Pinjie Xu. 2025. HeteroLLM: Accelerating Large Language Model Inference on Mobile SoCs platform with Heterogeneous AI Accelerators. arXiv:2501.14794 [cs.DC] https:\/\/arxiv.org\/abs\/2501.14794","DOI":"10.20944\/preprints202501.0901.v1"},{"key":"e_1_3_2_1_17_1","unstructured":"Wei Chen and Zhiyuan Li. 2024. Octopus v2: On-device language model for super agent. arXiv:2404.01744 [cs.CL] https:\/\/arxiv.org\/abs\/2404.01744"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2097"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM48880.2022.9796661"},{"key":"e_1_3_2_1_20_1","unstructured":"Suyu Ge Yunan Zhang Liyuan Liu Minjia Zhang Jiawei Han and Jianfeng Gao. 2024. Model Tells You What to Discard: Adaptive KV Cache Compression for LLMs. arXiv:2310.01801 [cs.CL] https:\/\/arxiv.org\/abs\/2310.01801"},{"key":"e_1_3_2_1_21_1","unstructured":"ggml. 2025. llama.cpp. https:\/\/github.com\/ggml-org\/llama.cpp"},{"key":"e_1_3_2_1_22_1","volume-title":"Scaling LLM Test-Time Compute with Mobile NPU on Smartphones. arXiv preprint arXiv:2509.23324","author":"Hao Zixu","year":"2025","unstructured":"Zixu Hao, Jianyu Wei, Tuowei Wang, Minxing Huang, Huiqiang Jiang, Shiqi Jiang, Ting Cao, and Ju Ren. 2025. Scaling LLM Test-Time Compute with Mobile NPU on Smartphones. arXiv preprint arXiv:2509.23324 (2025)."},{"key":"e_1_3_2_1_23_1","unstructured":"Zhiyuan He Yike Zhang Chengruidong Zhang Huiqiang Jiang Yuqing Yang and Lili Qiu. 2025. TriangleMix: Accelerating Prefilling via Decoding-time Contribution Sparsity. arXiv:2507.21526 [cs.CL] https:\/\/arxiv.org\/abs\/2507.21526"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538948"},{"key":"e_1_3_2_1_25_1","unstructured":"Albert Q. Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra Singh Chaplot Diego de las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u00e9lio Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2023. Mistral 7B. arXiv:2310.06825 [cs.CL] https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"e_1_3_2_1_26_1","unstructured":"Huiqiang Jiang Yucheng Li Chengruidong Zhang Qianhui Wu Xufang Luo Surin Ahn Zhenhua Han Amir H. Abdi Dongsheng Li Chin-Yew Lin Yuqing Yang and Lili Qiu. 2024. MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention. arXiv:2407.02490 [cs.CL] https:\/\/arxiv.org\/abs\/2407.02490"},{"key":"e_1_3_2_1_27_1","unstructured":"Xunhao Lai Jianqiao Lu Yao Luo Yiyuan Ma and Xun Zhou. 2025. FlexPre-fill: A Context-Aware Sparse Attention Mechanism for Efficient Long-Sequence Inference. arXiv:2502.20766 [cs.LG] https:\/\/arxiv.org\/abs\/2502.20766"},{"key":"e_1_3_2_1_28_1","volume-title":"Hojun Choi, Steven Y. Ko, Sangeun Oh, and Insik Shin.","author":"Lee Sunjae","year":"2024","unstructured":"Sunjae Lee, Junyoung Choi, Jungjae Lee, Munim Hasan Wasi, Hojun Choi, Steven Y. Ko, Sangeun Oh, and Insik Shin. 2024. Explore, Select, Derive, and Recall: Augmenting LLM with Human-like Memory for Mobile Task Automation. arXiv:2312.03003 [cs.HC] https:\/\/arxiv.org\/abs\/2312.03003"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Liang Li Xingke Yang Wen Wu Hao Wang Tomoaki Ohtsuki Xin Fu Miao Pan and Xuemin Shen. 2025. MobiLLM: Enabling LLM Fine-Tuning on the Mobile Device via Server Assisted Side Tuning. arXiv:2502.20421 [cs.LG] https:\/\/arxiv.org\/abs\/2502.20421","DOI":"10.1109\/JSTSP.2025.3633550"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3662006.3662059"},{"key":"e_1_3_2_1_31_1","volume-title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arXiv:2306.00978 [cs.CL] https:\/\/arxiv.org\/abs\/2306.00978","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arXiv:2306.00978 [cs.CL] https:\/\/arxiv.org\/abs\/2306.00978"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3715014.3722070"},{"key":"e_1_3_2_1_33_1","unstructured":"Mukul Lokhande Gopal Raut and Santosh Kumar Vishvakarma. 2024. Flex-PE: Flexible and SIMD Multi-Precision Processing Element for AI Workloads. arXiv:2412.11702 [cs.AR] https:\/\/arxiv.org\/abs\/2412.11702"},{"key":"e_1_3_2_1_34_1","volume-title":"POLARON: Precision-aware On-device Learning and Adaptive Runtime-cONfigurable AI acceleration. arXiv:2506.08785 [cs.AR] https:\/\/arxiv.org\/abs\/2506.08785","author":"Lokhande Mukul","year":"2025","unstructured":"Mukul Lokhande and Santosh Kumar Vishvakarma. 2025. POLARON: Precision-aware On-device Learning and Adaptive Runtime-cONfigurable AI acceleration. arXiv:2506.08785 [cs.AR] https:\/\/arxiv.org\/abs\/2506.08785"},{"key":"e_1_3_2_1_35_1","unstructured":"Enzhe Lu Zhejun Jiang Jingyuan Liu Yulun Du Tao Jiang Chao Hong Shaowei Liu Weiran He Enming Yuan Yuzhi Wang Zhiqi Huang Huan Yuan Suting Xu Xinran Xu Guokun Lai Yanru Chen Huabin Zheng Junjie Yan Jianlin Su Yuxin Wu Neo Y. Zhang Zhilin Yang Xinyu Zhou Mingxing Zhang and Jiezhong Qiu. 2025. MoBA: Mixture of Block Attention for Long-Context LLMs. arXiv:2502.13189 [cs.LG] https:\/\/arxiv.org\/abs\/2502.13189"},{"key":"e_1_3_2_1_36_1","unstructured":"Shuming Ma Hongyu Wang Lingxiao Ma Lei Wang Wenhui Wang Shaohan Huang Li Dong Ruiping Wang Jilong Xue and Furu Wei. 2024. The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits. arXiv:2402.17764 [cs.CL] https:\/\/arxiv.org\/abs\/2402.17764"},{"key":"e_1_3_2_1_37_1","unstructured":"Stephen Merity Caiming Xiong James Bradbury and Richard Socher. 2016. Pointer Sentinel Mixture Models. arXiv:1609.07843 [cs.CL]"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128111"},{"key":"e_1_3_2_1_39_1","unstructured":"MLC team. 2023-2025. MLC-LLM. https:\/\/github.com\/mlc-ai\/mlc-llm"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Mozhgan Navardi Romina Aalishah Yuzhe Fu Yueqian Lin Hai Li Yiran Chen and Tinoosh Mohsenin. 2025. GenAI at the Edge: Comprehensive Survey on Empowering Edge Devices. arXiv:2502.15816 [cs.DC] https:\/\/arxiv.org\/abs\/2502.15816","DOI":"10.32388\/JEU3U0"},{"key":"e_1_3_2_1_41_1","volume-title":"Timothy Kwok, and Guoliang Xing.","author":"Ouyang Xiaomin","year":"2024","unstructured":"Xiaomin Ouyang, Xian Shuai, Yang Li, Li Pan, Xifan Zhang, Heming Fu, Sitong Cheng, Xinyan Wang, Shihua Cao, Jiang Xin, Hazel Mok, Zhenyu Yan, Doris Sau Fung Yu, Timothy Kwok, and Guoliang Xing. 2024. ADMarker: A Multi-Modal Federated Learning System for Monitoring Digital Biomarkers of Alzheimer's Disease. arXiv:2310.15301 [cs.LG] https:\/\/arxiv.org\/abs\/2310.15301"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2022.3205713"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Dan Peng Zhihui Fu and Jun Wang. 2024. PocketLLM: Enabling On-Device Fine-Tuning for Personalized LLMs. arXiv:2407.01031 [cs.LG] https:\/\/arxiv.org\/abs\/2407.01031","DOI":"10.18653\/v1\/2024.privatenlp-1.10"},{"key":"e_1_3_2_1_44_1","unstructured":"phonelm. 2025. PhoneLM-0.5B. https:\/\/huggingface.co\/unsloth\/PhoneLM-0.5B"},{"key":"e_1_3_2_1_45_1","unstructured":"phonelm. 2025. PhoneLM-1.5B. https:\/\/huggingface.co\/unsloth\/PhoneLM-1.5B"},{"key":"e_1_3_2_1_46_1","unstructured":"Qwen : An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei Huan Lin Jian Yang Jianhong Tu Jianwei Zhang Jianxin Yang Jiaxi Yang Jingren Zhou Junyang Lin Kai Dang Keming Lu Keqin Bao Kexin Yang Le Yu Mei Li Mingfeng Xue Pei Zhang Qin Zhu Rui Men Runji Lin Tianhao Li Tianyi Tang Tingyu Xia Xingzhang Ren Xuancheng Ren Yang Fan Yang Su Yichang Zhang Yu Wan Yuqiong Liu Zeyu Cui Zhenru Zhang and Zihan Qiu. 2025. Qwen2.5 Technical Report. arXiv:2412.15115 [cs.CL] https:\/\/arxiv.org\/abs\/2412.15115"},{"key":"e_1_3_2_1_47_1","unstructured":"qwen. 2025. Qwen2-0.5B. https:\/\/huggingface.co\/unsloth\/Qwen2-0.5B"},{"key":"e_1_3_2_1_48_1","unstructured":"qwen. 2025. Qwen2-1.5B. https:\/\/huggingface.co\/unsloth\/Qwen2-1.5B"},{"key":"e_1_3_2_1_49_1","unstructured":"redmi. 2025. Redmi K60 Champion Edition Smartphone. https:\/\/www.gsmarena.com\/xiaomi_redmi_k60_pro-12046.php"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696067"},{"key":"e_1_3_2_1_51_1","unstructured":"Andrii Skliar Ties van Rozendaal Romain Lepert Todor Boinovski Mart van Baalen Markus Nagel Paul Whatmough and Babak Ehteshami Bejnordi. 2025. Mixture of Cache-Conditional Experts for Efficient Mobile Device Inference. arXiv:2412.00099 [cs.LG] https:\/\/arxiv.org\/abs\/2412.00099"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1170"},{"key":"e_1_3_2_1_53_1","unstructured":"Jianlin Su Yu Lu Shengfeng Pan Ahmed Murtadha Bo Wen and Yunfeng Liu. 2023. RoFormer: Enhanced Transformer with Rotary Position Embedding. arXiv:2104.09864 [cs.CL] https:\/\/arxiv.org\/abs\/2104.09864"},{"key":"e_1_3_2_1_54_1","unstructured":"Shreyas Subramanian Vikram Elango and Mecit Gungor. 2025. Small Language Models (SLMs) Can Still Pack a Punch: A survey. arXiv:2501.05465 [cs.CL] https:\/\/arxiv.org\/abs\/2501.05465"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716278"},{"key":"e_1_3_2_1_56_1","volume-title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference. arXiv:2406.10774 [cs.CL] https:\/\/arxiv.org\/abs\/2406.10774","author":"Tang Jiaming","year":"2024","unstructured":"Jiaming Tang, Yilong Zhao, Kan Zhu, Guangxuan Xiao, Baris Kasikci, and Song Han. 2024. Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference. arXiv:2406.10774 [cs.CL] https:\/\/arxiv.org\/abs\/2406.10774"},{"key":"e_1_3_2_1_57_1","unstructured":"TFLite team. 2025. mediapipe. https:\/\/ai.google.dev\/edge\/mediapipe\/solutions\/guide"},{"key":"e_1_3_2_1_58_1","unstructured":"Omkar Thawakar Ashmal Vayani Salman Khan Hisham Cholakal Rao M. Anwer Michael Felsberg Tim Baldwin Eric P. Xing and Fahad Shahbaz Khan. 2024. MobiLlama: Towards Accurate and Lightweight Fully Transparent GPT. arXiv:2402.16840 [cs.CL] https:\/\/arxiv.org\/abs\/2402.16840"},{"key":"e_1_3_2_1_59_1","volume-title":"SpAtten: Efficient Sparse Attention Architecture with Cascade Token and Head Pruning. HPCA","author":"Wang Hanrui","year":"2021","unstructured":"Hanrui Wang, Zhekai Zhang, and Song Han. 2021. SpAtten: Efficient Sparse Attention Architecture with Cascade Token and Head Pruning. HPCA (2021)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581791.3596870"},{"key":"e_1_3_2_1_61_1","unstructured":"Haocheng Xi Shuo Yang Yilong Zhao Chenfeng Xu Muyang Li Xiuyu Li Yujun Lin Han Cai Jintao Zhang Dacheng Li Jianfei Chen Ion Stoica Kurt Keutzer and Song Han. 2025. Sparse VideoGen: Accelerating Video Diffusion Transformers with Spatial-Temporal Sparsity. arXiv:2502.01776 [cs.CV] https:\/\/arxiv.org\/abs\/2502.01776"},{"key":"e_1_3_2_1_62_1","unstructured":"Guangxuan Xiao Ji Lin Mickael Seznec Hao Wu Julien Demouth and Song Han. 2024. SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models. arXiv:2211.10438 [cs.CL] https:\/\/arxiv.org\/abs\/2211.10438"},{"key":"e_1_3_2_1_63_1","unstructured":"Guangxuan Xiao Jiaming Tang Jingwei Zuo Junxian Guo Shang Yang Haotian Tang Yao Fu and Song Han. 2024. DuoAttention: Efficient Long-Context LLM Inference with Retrieval and Streaming Heads. arXiv:2410.10819 [cs.CL] https:\/\/arxiv.org\/abs\/2410.10819"},{"key":"e_1_3_2_1_64_1","unstructured":"xiaomi. 2025. MI14 Smartphone. https:\/\/www.mi.com\/global\/product\/xiaomi-14\/specs\/"},{"key":"e_1_3_2_1_65_1","unstructured":"Weikai Xie Li Zhang Shihe Wang Rongjie Yi and Mengwei Xu. 2024. DroidCall: A Dataset for LLM-powered Android Intent Invocation. arXiv:2412.00402 [cs.AI] https:\/\/arxiv.org\/abs\/2412.00402"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2024.3513457"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707239"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3706418"},{"key":"e_1_3_2_1_69_1","unstructured":"Mengwei Xu Wangsong Yin Dongqi Cai Rongjie Yi Daliang Xu Qipeng Wang Bingyang Wu Yihao Zhao Chen Yang Shihe Wang Qiyang Zhang Zhenyan Lu Li Zhang Shangguang Wang Yuanchun Li Yunxin Liu Xin Jin and Xuanzhe Liu. 2024. A Survey of Resource-efficient LLM and Multimodal Foundation Models. arXiv:2401.08092 [cs.LG] https:\/\/arxiv.org\/abs\/2401.08092"},{"key":"e_1_3_2_1_70_1","unstructured":"Ruyi Xu Guangxuan Xiao Haofeng Huang Junxian Guo and Song Han. 2025. XAttention: Block Sparse Attention with Antidiagonal Scoring. arXiv:2503.16428 [cs.CL] https:\/\/arxiv.org\/abs\/2503.16428"},{"key":"e_1_3_2_1_71_1","unstructured":"Zhenliang Xue Yixin Song Zeyu Mi Xinrui Zheng Yubin Xia and Haibo Chen. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv:2406.06282 [cs.LG] https:\/\/arxiv.org\/abs\/2406.06282"},{"key":"e_1_3_2_1_72_1","unstructured":"Bufang Yang Lilin Xu Liekang Zeng Kaiwei Liu Siyang Jiang Wenrui Lu Hongkai Chen Xiaofan Jiang Guoliang Xing and Zhenyu Yan. 2025. ContextAgent: Context-Aware Proactive LLM Agents with Open-World Sensory Perceptions. arXiv:2505.14668 [cs.AI] https:\/\/arxiv.org\/abs\/2505.14668"},{"key":"e_1_3_2_1_73_1","unstructured":"Shang Yang Junxian Guo Haotian Tang Qinghao Hu Guangxuan Xiao Jiaming Tang Yujun Lin Zhijian Liu Yao Lu and Song Han. 2025. LServe: Efficient Long-sequence LLM Serving with Unified Sparse Attention. arXiv:2502.14866 [cs.CL] https:\/\/arxiv.org\/abs\/2502.14866"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372224.3419192"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3666025.3699327"},{"key":"e_1_3_2_1_76_1","unstructured":"Wangsong Yin Mengwei Xu Yuanchun Li and Xuanzhe Liu. 2024. LLM as a System Service on Mobile Devices. arXiv:2403.11805 [cs.OS] https:\/\/arxiv.org\/abs\/2403.11805"},{"key":"e_1_3_2_1_77_1","volume-title":"ELMS: Elasticized Large Language Models On Mobile Devices. arXiv:2409.09071 [cs.DC] https:\/\/arxiv.org\/abs\/2409.09071","author":"Yin Wangsong","year":"2024","unstructured":"Wangsong Yin, Rongjie Yi, Daliang Xu, Gang Huang, Mengwei Xu, and Xuanzhe Liu. 2024. ELMS: Elasticized Large Language Models On Mobile Devices. arXiv:2409.09071 [cs.DC] https:\/\/arxiv.org\/abs\/2409.09071"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640390"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"crossref","unstructured":"Jingyang Yuan Huazuo Gao Damai Dai Junyu Luo Liang Zhao Zhengyan Zhang Zhenda Xie Y. X. Wei Lean Wang Zhiping Xiao Yuqing Wang Chong Ruan Ming Zhang Wenfeng Liang and Wangding Zeng. 2025. Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention. arXiv:2502.11089 [cs.CL] https:\/\/arxiv.org\/abs\/2502.11089","DOI":"10.18653\/v1\/2025.acl-long.1126"},{"key":"e_1_3_2_1_80_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Zhang Jintao","year":"2025","unstructured":"Jintao Zhang, Chendong Xiang, Haofeng Huang, Jia Wei, Haocheng Xi, Jun Zhu, and Jianfei Chen. 2025. Spargeattn: Accurate sparse attention accelerating any model inference. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"crossref","unstructured":"Li Zhang Shihe Wang Xianqing Jia Zhihan Zheng Yunhe Yan Longxi Gao Yuanchun Li and Mengwei Xu. 2024. LlamaTouch: A Faithful and Scalable Testbed for Mobile UI Task Automation. arXiv:2404.16054 [cs.HC] https:\/\/arxiv.org\/abs\/2404.16054","DOI":"10.1145\/3654777.3676382"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"crossref","unstructured":"Li Zhang Shihe Wang Xianqing Jia Zhihan Zheng Yunhe Yan Longxi Gao Yuanchun Li and Mengwei Xu. 2024. LlamaTouch: A Faithful and Scalable Testbed for Mobile UI Task Automation. arXiv:2404.16054 [cs.HC] https:\/\/arxiv.org\/abs\/2404.16054","DOI":"10.1145\/3654777.3676382"},{"key":"e_1_3_2_1_83_1","unstructured":"Peiyuan Zhang Guangtao Zeng Tianduo Wang and Wei Lu. 2024. TinyLlama: An Open-Source Small Language Model. arXiv:2401.02385 [cs.CL] https:\/\/arxiv.org\/abs\/2401.02385"},{"key":"e_1_3_2_1_84_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer.","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arXiv:2205.01068 [cs.CL] https:\/\/arxiv.org\/abs\/2205.01068"},{"key":"e_1_3_2_1_85_1","unstructured":"Zhenyu Zhang Ying Sheng Tianyi Zhou Tianlong Chen Lianmin Zheng Ruisi Cai Zhao Song Yuandong Tian Christopher R\u00e9 Clark Barrett Zhangyang Wang and Beidi Chen. 2023. H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. arXiv:2306.14048 [cs.LG] https:\/\/arxiv.org\/abs\/2306.14048"},{"key":"e_1_3_2_1_86_1","unstructured":"An Zou Yuankai Xu Yinchen Ni Jintao Chen Yehan Ma Jing Li Christopher Gill Xuan Zhang and Yier Jin. 2025. A Survey of Real-time Scheduling on Accelerator-based Heterogeneous Architecture for Time Critical Applications. arXiv:2505.11970 [cs.DC] https:\/\/arxiv.org\/abs\/2505.11970"}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:54:33Z","timestamp":1780059273000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809205"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":86,"alternative-id":["10.1145\/3745756.3809205","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809205","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}