{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:10:26Z","timestamp":1775067026711,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["623B2074"],"award-info":[{"award-number":["623B2074"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62088102"],"award-info":[{"award-number":["62088102"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62472279"],"award-info":[{"award-number":["62472279"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021ZD0200300"],"award-info":[{"award-number":["2021ZD0200300"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764808","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"359-374","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Characterizing Mobile SoC for Accelerating Heterogeneous LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6562-5485","authenticated-orcid":false,"given":"Le","family":"Chen","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1146-8552","authenticated-orcid":false,"given":"Dahu","family":"Feng","sequence":"additional","affiliation":[{"name":"Tsinghua university, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5957-3024","authenticated-orcid":false,"given":"Erhu","family":"Feng","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0673-8193","authenticated-orcid":false,"given":"Yingrui","family":"Wang","sequence":"additional","affiliation":[{"name":"SenseTime, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2320-0326","authenticated-orcid":false,"given":"Rong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6558-5298","authenticated-orcid":false,"given":"Yubin","family":"Xia","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0882-6520","authenticated-orcid":false,"given":"Pinjie","family":"Xu","sequence":"additional","affiliation":[{"name":"SenseTime Research, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9720-0361","authenticated-orcid":false,"given":"Haibo","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai JiaoTong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"283","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. Tensorflow: A system for large-scale machine learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16), pages 265\u2013283, Savannah, GA, November 2016. USENIX Association."},{"key":"e_1_3_2_1_2_1","first-page":"134","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. Taming Throughput-Latency tradeoff in LLM inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 117\u2013134, Santa Clara, CA, July 2024. USENIX Association."},{"key":"e_1_3_2_1_3_1","volume-title":"Claude - right-sized for any task, the claude family of models offers the best combination of speed and performance. https:\/\/www.anthropic.com\/","year":"2024","unstructured":"Anthropic. Claude - right-sized for any task, the claude family of models offers the best combination of speed and performance. https:\/\/www.anthropic.com\/, 2024. Referenced December 2024."},{"key":"e_1_3_2_1_4_1","volume-title":"https:\/\/nanoreview.net\/en\/soc\/apple-a18","year":"2024","unstructured":"Apple. A18. https:\/\/nanoreview.net\/en\/soc\/apple-a18, 2024. Referenced December 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"Longbench: A bilingual, multitask benchmark for long context understanding","author":"Bai Yushi","year":"2024","unstructured":"Yushi Bai, Xin Lv, Jiajie Zhang, Hongchang Lyu, Jiankai Tang, Zhidian Huang, Zhengxiao Du, Xiao Liu, Aohan Zeng, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li. Longbench: A bilingual, multitask benchmark for long context understanding, 2024."},{"key":"e_1_3_2_1_6_1","first-page":"594","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. TVM: An automated End-to-End optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pages 578\u2013594, Carlsbad, CA, October 2018. USENIX Association."},{"key":"e_1_3_2_1_7_1","volume-title":"Teq: Trainable equivalent transformation for quantization of llms. arXiv preprint arXiv:2310.10944","author":"Cheng Wenhua","year":"2023","unstructured":"Wenhua Cheng, Yiyang Cai, Kaokao Lv, and Haihao Shen. Teq: Trainable equivalent transformation for quantization of llms. arXiv preprint arXiv:2310.10944, 2023."},{"key":"e_1_3_2_1_8_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323, 2022."},{"key":"e_1_3_2_1_9_1","first-page":"153","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. ServerlessLLM: Low-Latency serverless inference for large language models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 135\u2013153, Santa Clara, CA, July 2024. USENIX Association."},{"issue":"5","key":"e_1_3_2_1_10_1","article-title":"A unified programmable edge matrix processor for deep neural networks and matrix algebra","volume":"21","author":"George Biji","year":"2022","unstructured":"Biji George, Om Ji Omer, Ziaul Choudhury, Anoop V, and Sreenivas Subramoney. A unified programmable edge matrix processor for deep neural networks and matrix algebra. ACM Trans. Embed. Comput. Syst., 21(5), October 2022.","journal-title":"ACM Trans. Embed. Comput. Syst."},{"key":"e_1_3_2_1_11_1","volume-title":"llama.cpp - LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware. https:\/\/github.com\/ggerganov\/llama.cpp","year":"2023","unstructured":"Ggerganov. llama.cpp - LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware. https:\/\/github.com\/ggerganov\/llama.cpp, 2023. Referenced December 2024."},{"key":"e_1_3_2_1_12_1","unstructured":"Team GLM Aohan Zeng Bin Xu Bowen Wang Chenhui Zhang Da Yin Diego Rojas Guanyu Feng Hanlin Zhao Hanyu Lai Hao Yu Hongning Wang Jiadai Sun Jiajie Zhang Jiale Cheng Jiayi Gui Jie Tang Jing Zhang Juanzi Li Lei Zhao Lindong Wu Lucen Zhong Mingdao Liu Minlie Huang Peng Zhang Qinkai Zheng Rui Lu Shuaiqi Duan Shudan Zhang Shulin Cao Shuxun Yang Weng Lam Tam Wenyi Zhao Xiao Liu Xiao Xia Xiaohan Zhang Xiaotao Gu Xin Lv Xinghan Liu Xinyi Liu Xinyue Yang Xixuan Song Xunkai Zhang Yifan An Yifan Xu Yilin Niu Yuantao Yang Yueyan Li Yushi Bai Yuxiao Dong Zehan Qi Zhaoyu Wang Zhen Yang Zhengxiao Du Zhenyu Hou and Zihan Wang. Chatglm: A family of large language models from glm-130b to glm-4 all tools 2024."},{"key":"e_1_3_2_1_13_1","volume-title":"TensorFlow Lite - Google's high-performance runtime for on-device AI. https:\/\/tensorflow.google.cn\/lite","year":"2017","unstructured":"Google. TensorFlow Lite - Google's high-performance runtime for on-device AI. https:\/\/tensorflow.google.cn\/lite, 2017. Referenced December 2024."},{"key":"e_1_3_2_1_14_1","volume-title":"Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868","author":"Hong Wenyi","year":"2022","unstructured":"Wenyi Hong, Ming Ding, Wendi Zheng, Xinghan Liu, and Jie Tang. Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868, 2022."},{"key":"e_1_3_2_1_15_1","first-page":"14290","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Hong Wenyi","year":"2024","unstructured":"Wenyi Hong, Weihan Wang, Qingsong Lv, Jiazheng Xu, Wenmeng Yu, Junhui Ji, Yan Wang, Zihan Wang, Yuxiao Dong, Ming Ding, et al. Cogagent: A visual language model for gui agents. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 14281\u201314290, 2024."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC '21","author":"Hsu Kuan-Chieh","year":"2021","unstructured":"Kuan-Chieh Hsu and Hung-Wei Tseng. Accelerating applications using edge tensor processing units. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC '21, New York, NY, USA, 2021. Association for Computing Machinery."},{"key":"e_1_3_2_1_17_1","first-page":"152","volume-title":"Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Hsu Kuan-Chieh","year":"2023","unstructured":"Kuan-Chieh Hsu and Hung-Wei Tseng. Simultaneous and heterogenous multithreading. In Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture, pages 137\u2013152, 2023."},{"key":"e_1_3_2_1_18_1","volume-title":"Shmt: Exploiting simultaneous and heterogeneous parallelism in accelerator-rich architectures","author":"Hsu Kuan-Chieh","year":"2024","unstructured":"Kuan-Chieh Hsu and Hung-Wei Tseng. Shmt: Exploiting simultaneous and heterogeneous parallelism in accelerator-rich architectures. IEEE Micro, 2024."},{"key":"e_1_3_2_1_19_1","volume-title":"https:\/\/www.hisilicon.com\/en\/products\/Kirin\/Kirin-flagship-chips\/Kirin-9000","year":"2020","unstructured":"Huawei. Kirin-9000. https:\/\/www.hisilicon.com\/en\/products\/Kirin\/Kirin-flagship-chips\/Kirin-9000, 2020. Referenced December 2024."},{"key":"e_1_3_2_1_20_1","volume-title":"Huawei hiai engine - about the service. https:\/\/developer.huawei.com\/consumer\/en\/doc\/hiai-References\/ir-overview-0000001052569365","year":"2024","unstructured":"Huawei. Huawei hiai engine - about the service. https:\/\/developer.huawei.com\/consumer\/en\/doc\/hiai-References\/ir-overview-0000001052569365, 2024. Referenced December 2024."},{"issue":"3","key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3626793","article-title":"Automated backend allocation for multi-model, on-device ai inference","volume":"7","author":"Iyer Venkatraman","year":"2023","unstructured":"Venkatraman Iyer, Sungho Lee, Semun Lee, Juitem Joonwoo Kim, Hyunjun Kim, and Youngjae Shin. Automated backend allocation for multi-model, on-device ai inference. Proceedings of the ACM on Measurement and Analysis of Computing Systems, 7(3):1\u201333, 2023.","journal-title":"Proceedings of the ACM on Measurement and Analysis of Computing Systems"},{"key":"e_1_3_2_1_22_1","first-page":"688","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Jia Xianyan","year":"2022","unstructured":"Xianyan Jia, Le Jiang, Ang Wang, Wencong Xiao, Ziji Shi, Jie Zhang, Xinyuan Li, Langshi Chen, Yong Li, Zhen Zheng, et al. Whale: Efficient giant model training over heterogeneous {GPUs}. In 2022 USENIX Annual Technical Conference (USENIX ATC 22), pages 673\u2013688, 2022."},{"key":"e_1_3_2_1_23_1","first-page":"479","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Jiang Yimin","year":"2020","unstructured":"Yimin Jiang, Yibo Zhu, Chang Lan, Bairen Yi, Yong Cui, and Chuanxiong Guo. A unified architecture for accelerating distributed {DNN} training in heterogeneous {GPU\/CPU} clusters. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 463\u2013479, 2020."},{"key":"e_1_3_2_1_24_1","first-page":"12","volume-title":"Proceedings of the 44th Annual International Symposium on Computer Architecture, ISCA '17","author":"Jouppi Norman P.","year":"2017","unstructured":"Norman P. Jouppi, Cliff Young, Nishant Patil, David Patterson, Gaurav Agrawal, Raminder Bajwa, Sarah Bates, Suresh Bhatia, Nan Boden, Al Borchers, Rick Boyle, Pierre-luc Cantin, Clifford Chao, Chris Clark, Jeremy Coriell, Mike Daley, Matt Dau, Jeffrey Dean, Ben Gelb, Tara Vazir Ghaemmaghami, Rajendra Gottipati, William Gulland, Robert Hagmann, C. Richard Ho, Doug Hogberg, John Hu, Robert Hundt, Dan Hurt, Julian Ibarz, Aaron Jaffey, Alek Jaworski, Alexander Kaplan, Harshit Khaitan, Daniel Killebrew, Andy Koch, Naveen Kumar, Steve Lacy, James Laudon, James Law, Diemthu Le, Chris Leary, Zhuyuan Liu, Kyle Lucke, Alan Lundin, Gordon MacKean, Adriana Maggiore, Maire Mahony, Kieran Miller, Rahul Nagarajan, Ravi Narayanaswami, Ray Ni, Kathy Nix, Thomas Norrie, Mark Omernick, Narayana Penukonda, Andy Phelps, Jonathan Ross, Matt Ross, Amir Salek, Emad Samadiani, Chris Severn, Gregory Sizikov, Matthew Snelham, Jed Souter, Dan Steinberg, Andy Swing, Mercedes Tan, Gregory Thorson, Bo Tian, Horia Toma, Erick Tuttle, Vijay Vasudevan, Richard Walter, Walter Wang, Eric Wilcox, and Doe Hyun Yoon. In-datacenter performance analysis of a tensor processing unit. In Proceedings of the 44th Annual International Symposium on Computer Architecture, ISCA '17, pages 1\u201312, New York, NY, USA, 2017. Association for Computing Machinery."},{"key":"e_1_3_2_1_25_1","first-page":"83","volume-title":"2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","author":"Kwon Hyoukjun","unstructured":"Hyoukjun Kwon, Liangzhen Lai, Michael Pellauer, Tushar Krishna, Yu-Hsin Chen, and Vikas Chandra. Heterogeneous dataflow accelerators for multi-dnn workloads. In 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA), pages 71\u201383. IEEE, 2021."},{"key":"e_1_3_2_1_26_1","first-page":"626","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. Efficient memory management for large language model serving with pagedattention. In Proceedings of the 29th Symposium on Operating Systems Principles, pages 611\u2013626, 2023."},{"key":"e_1_3_2_1_27_1","volume-title":"Large language model inference acceleration: A comprehensive hardware perspective. arXiv preprint arXiv:2410.04466","author":"Li Jinhao","year":"2024","unstructured":"Jinhao Li, Jiaming Xu, Shan Huang, Yonghua Chen, Wen Li, Jun Liu, Yaoxiu Lian, Jiayi Pan, Li Ding, Hao Zhou, et al. Large language model inference acceleration: A comprehensive hardware perspective. arXiv preprint arXiv:2410.04466, 2024."},{"key":"e_1_3_2_1_28_1","first-page":"679","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. AlpaServe: Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23), pages 663\u2013679, Boston, MA, July 2023. USENIX Association."},{"key":"e_1_3_2_1_29_1","volume-title":"Belle: Be everyone's large language model engine. https:\/\/github.com\/LianjiaTech\/BELLE","year":"2024","unstructured":"LianjiaTech. Belle: Be everyone's large language model engine. https:\/\/github.com\/LianjiaTech\/BELLE, 2024."},{"key":"e_1_3_2_1_30_1","first-page":"87","article-title":"Activation-aware weight quantization for on-device llm compression and acceleration","volume":"6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, WeiChen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. Awq: Activation-aware weight quantization for on-device llm compression and acceleration. Proceedings of Machine Learning and Systems, 6:87\u2013100, 2024.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_31_1","first-page":"1040","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Liu Yizhi","year":"2019","unstructured":"Yizhi Liu, Yao Wang, Ruofei Yu, Mu Li, Vin Sharma, and Yida Wang. Optimizing CNN model inference on CPUs. In 2019 USENIX Annual Technical Conference (USENIX ATC 19), pages 1025\u20131040, Renton, WA, July 2019. USENIX Association."},{"key":"e_1_3_2_1_32_1","volume-title":"From google gemini to openai q*(q-star): A survey of reshaping the generative artificial intelligence (ai) research landscape. arXiv preprint arXiv:2312.10868","author":"McIntosh Timothy R","year":"2023","unstructured":"Timothy R McIntosh, Teo Susnjak, Tong Liu, Paul Watters, and Malka N Halgamuge. From google gemini to openai q*(q-star): A survey of reshaping the generative artificial intelligence (ai) research landscape. arXiv preprint arXiv:2312.10868, 2023."},{"key":"e_1_3_2_1_33_1","volume-title":"Accelerated edge machine learning. https:\/\/onnxruntime.ai\/","year":"2020","unstructured":"Microsoft. Accelerated edge machine learning. https:\/\/onnxruntime.ai\/, 2020. Referenced April 2023."},{"key":"e_1_3_2_1_34_1","volume-title":"MLC-LLM - Universal LLM Deployment Engine with ML Compilation. https:\/\/github.com\/mlc-ai\/mlc-llm","author":"MLC","year":"2023","unstructured":"MLC team. MLC-LLM - Universal LLM Deployment Engine with ML Compilation. https:\/\/github.com\/mlc-ai\/mlc-llm, 2023-2024. Referenced December 2024."},{"key":"e_1_3_2_1_35_1","volume-title":"https:\/\/nanoreview.net\/en\/soc-list\/rating","year":"2023","unstructured":"Nanoreview. Online. https:\/\/nanoreview.net\/en\/soc-list\/rating, 2023-2024. Referenced December 2024."},{"key":"e_1_3_2_1_36_1","first-page":"579","volume-title":"2024 57th IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"Odema Mohanad","year":"2024","unstructured":"Mohanad Odema, Luke Chen, Hyoukjun Kwon, and Mohammad Abdullah Al Faruque. Scar: Scheduling multi-model ai workloads on heterogeneous multi-chiplet module accelerators. In 2024 57th IEEE\/ACM International Symposium on Microarchitecture (MICRO), pages 565\u2013579, 2024."},{"key":"e_1_3_2_1_37_1","volume-title":"https:\/\/huggingface.co\/datasets\/openai\/gsm8k","author":"AI.","year":"2024","unstructured":"OpenAI. Gsm8k (grade school math 8k). https:\/\/huggingface.co\/datasets\/openai\/gsm8k, 2024."},{"key":"e_1_3_2_1_38_1","volume-title":"Introducing chatgpt. https:\/\/openai.com\/index\/chatgpt\/","author":"AI.","year":"2024","unstructured":"OpenAI. Introducing chatgpt. https:\/\/openai.com\/index\/chatgpt\/, 2024. Referenced January 2024."},{"key":"e_1_3_2_1_39_1","unstructured":"Gunho Park Baeseong Park Minsub Kim Sungjae Lee Jeonghoon Kim Beomseok Kwon Se Jung Kwon Byeongwook Kim Youngjoo Lee and Dongsoo Lee. Lut-gemm: Quantized matrix multiplication based on luts for efficient inference in large-scale generative language models. arXiv preprint arXiv:2206.09557 2022."},{"key":"e_1_3_2_1_40_1","first-page":"132","volume-title":"2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)","author":"Patel Pratyush","year":"2024","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, Aashaka Shah, \u00cd\u00f1igo Goiri, Saeed Maleki, and Ricardo Bianchini. Splitwise: Efficient generative llm inference using phase splitting. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA), pages 118\u2013132, 2024."},{"key":"e_1_3_2_1_41_1","volume-title":"Snapdragon 8Gen3 - Mobile Platform ignites endless possibilities. https:\/\/www.qualcomm.com\/products\/mobile\/snapdragon\/smartphones","year":"2023","unstructured":"Qualcomm. Snapdragon 8Gen3 - Mobile Platform ignites endless possibilities. https:\/\/www.qualcomm.com\/products\/mobile\/snapdragon\/smartphones, 2023-2024. Referenced December 2024."},{"key":"e_1_3_2_1_42_1","volume-title":"https:\/\/aihub.qualcomm.com\/mobile\/models\/llama_v3_1_8b_chat_quantized?searchTerm=llama","year":"2024","unstructured":"Qualcomm. Llama-v3.1-8b-chat on qualcomm 8 elite. https:\/\/aihub.qualcomm.com\/mobile\/models\/llama_v3_1_8b_chat_quantized?searchTerm=llama, 2024. Referenced December 2024."},{"key":"e_1_3_2_1_43_1","volume-title":"https:\/\/www.qualcomm.com\/developer\/software\/qualcomm-ai-engine-direct-sdk","year":"2024","unstructured":"Qualcomm. Qnn. https:\/\/www.qualcomm.com\/developer\/software\/qualcomm-ai-engine-direct-sdk, 2024. Referenced December 2024."},{"key":"e_1_3_2_1_44_1","volume-title":"Qualcomm adreno gpu, game-changing speed and efficiency. https:\/\/www.qualcomm.com\/products\/technology\/processors\/adreno","year":"2025","unstructured":"Qualcomm. Qualcomm adreno gpu, game-changing speed and efficiency. https:\/\/www.qualcomm.com\/products\/technology\/processors\/adreno, 2025."},{"key":"e_1_3_2_1_45_1","volume-title":"Qualcomm hexagon npu, powering the generative ai revolution. https:\/\/www.qualcomm.com\/products\/technology\/processors\/hexagon","year":"2025","unstructured":"Qualcomm. Qualcomm hexagon npu, powering the generative ai revolution. https:\/\/www.qualcomm.com\/products\/technology\/processors\/hexagon, 2025."},{"issue":"6","key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","first-page":"519","DOI":"10.1145\/2499370.2462176","article-title":"a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines","volume":"48","author":"Ragan-Kelley Jonathan","year":"2013","unstructured":"Jonathan Ragan-Kelley, Connelly Barnes, Andrew Adams, Sylvain Paris, Fr\u00e9do Durand, and Saman Amarasinghe. Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines. Acm Sigplan Notices, 48(6):519\u2013530, 2013.","journal-title":"Acm Sigplan Notices"},{"key":"e_1_3_2_1_47_1","first-page":"692","volume-title":"19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25)","author":"Shen Weihang","year":"2025","unstructured":"Weihang Shen, Mingcong Han, Jialong Liu, Rong Chen, and Haibo Chen. {XSched}: Preemptive scheduling for diverse {XPUs}. In 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25), pages 671\u2013692, 2025."},{"key":"e_1_3_2_1_48_1","first-page":"485","volume-title":"Proceedings of the ACM SIGCOMM 2023 Conference","author":"Shubha Sudipta Saha","year":"2023","unstructured":"Sudipta Saha Shubha and Haiying Shen. Adainf: Data drift adaptive scheduling for accurate and slo-guaranteed multiple-model inference serving at edge servers. In Proceedings of the ACM SIGCOMM 2023 Conference, pages 473\u2013485, 2023."},{"key":"e_1_3_2_1_49_1","first-page":"355","volume-title":"2020 IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"Song Linghao","unstructured":"Linghao Song, Fan Chen, Youwei Zhuo, Xuehai Qian, Hai Li, and Yiran Chen. Accpar: Tensor partitioning for heterogeneous deep learning accelerators. In 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA), pages 342\u2013355. IEEE, 2020."},{"key":"e_1_3_2_1_50_1","volume-title":"et al. Theano: A python framework for fast computation of mathematical expressions. arXiv preprint arXiv:1605.02688","author":"Development Team The Theano","year":"2016","unstructured":"The Theano Development Team, Rami Al-Rfou, Guillaume Alain, Amjad Almahairi, Christof Angermueller, Dzmitry Bahdanau, Nicolas Ballas, Fr\u00e9d\u00e9ric Bastien, Justin Bayer, Anatoly Belikov, et al. Theano: A python framework for fast computation of mathematical expressions. arXiv preprint arXiv:1605.02688, 2016."},{"key":"e_1_3_2_1_51_1","volume-title":"NCNN - High-performance neural network inference computing framework optimized for mobile platforms. https:\/\/github.com\/Tencent\/ncnn","year":"2017","unstructured":"Tencent. NCNN - High-performance neural network inference computing framework optimized for mobile platforms. https:\/\/github.com\/Tencent\/ncnn, 2017. Referenced December 2024."},{"key":"e_1_3_2_1_52_1","volume-title":"Mobile-agent-v2: Mobile device operation assistant with effective navigation via multi-agent collaboration. arXiv preprint arXiv:2406.01014","author":"Wang Junyang","year":"2024","unstructured":"Junyang Wang, Haiyang Xu, Haitao Jia, Xi Zhang, Ming Yan, Weizhou Shen, Ji Zhang, Fei Huang, and Jitao Sang. Mobile-agent-v2: Mobile device operation assistant with effective navigation via multi-agent collaboration. arXiv preprint arXiv:2406.01014, 2024."},{"key":"e_1_3_2_1_53_1","volume-title":"Mobile-agent: Autonomous multimodal mobile device agent with visual perception. arXiv preprint arXiv:2401.16158","author":"Wang Junyang","year":"2024","unstructured":"Junyang Wang, Haiyang Xu, Jiabo Ye, Ming Yan, Weizhou Shen, Ji Zhang, Fei Huang, and Jitao Sang. Mobile-agent: Autonomous multimodal mobile device agent with visual perception. arXiv preprint arXiv:2401.16158, 2024."},{"key":"e_1_3_2_1_54_1","volume-title":"Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191, 2024."},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the 6th ACM International Conference on Multimedia in Asia Workshops, MMAsia '24 Workshops","author":"Wang Zhaode","year":"2024","unstructured":"Zhaode Wang, Jingbang Yang, Xinyu Qian, Shiwen Xing, Xiaotang Jiang, Chengfei Lv, and Shengyu Zhang. Mnn-llm: A generic inference engine for fast large language model deployment on mobile devices. In Proceedings of the 6th ACM International Conference on Multimedia in Asia Workshops, MMAsia '24 Workshops, New York, NY, USA, 2024. Association for Computing Machinery."},{"key":"e_1_3_2_1_56_1","volume-title":"Fast on-device llm inference with npus","author":"Xu Daliang","year":"2024","unstructured":"Daliang Xu, Hao Zhang, Liming Yang, Ruiqi Liu, Gang Huang, Mengwei Xu, and Xuanzhe Liu. Fast on-device llm inference with npus, 2024."},{"key":"e_1_3_2_1_57_1","first-page":"86","volume-title":"Proceedings of the 19th Workshop on Hot Topics in Operating Systems, HOTOS '23","author":"Xue Yuqi","year":"2023","unstructured":"Yuqi Xue, Yiqi Liu, and Jian Huang. System virtualization for neural processing units. In Proceedings of the 19th Workshop on Hot Topics in Operating Systems, HOTOS '23, page 80\u201386, New York, NY, USA, 2023. Association for Computing Machinery."},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the 50th Annual International Symposium on Computer Architecture, ISCA '23","author":"Xue Yuqi","year":"2023","unstructured":"Yuqi Xue, Yiqi Liu, Lifeng Nai, and Jian Huang. V10: Hardware-assisted npu multi-tenancy for improved resource utilization and fairness. In Proceedings of the 50th Annual International Symposium on Computer Architecture, ISCA '23, New York, NY, USA, 2023. Association for Computing Machinery."},{"key":"e_1_3_2_1_59_1","volume-title":"Hardware-assisted virtualization of neural processing units for cloud platforms. arXiv preprint arXiv:2408.04104","author":"Xue Yuqi","year":"2024","unstructured":"Yuqi Xue, Yiqi Liu, Lifeng Nai, and Jian Huang. Hardware-assisted virtualization of neural processing units for cloud platforms. arXiv preprint arXiv:2408.04104, 2024."},{"key":"e_1_3_2_1_60_1","volume-title":"Powerinfer-2: Fast large language model inference on a smartphone","author":"Xue Zhenliang","year":"2024","unstructured":"Zhenliang Xue, Yixin Song, Zeyu Mi, Xinrui Zheng, Yubin Xia, and Haibo Chen. Powerinfer-2: Fast large language model inference on a smartphone, 2024."},{"key":"e_1_3_2_1_61_1","volume-title":"et al. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:2408.06072","author":"Yang Zhuoyi","year":"2024","unstructured":"Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, et al. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:2408.06072, 2024."},{"key":"e_1_3_2_1_62_1","first-page":"255","volume-title":"European Conference on Computer Vision","author":"You Keen","unstructured":"Keen You, Haotian Zhang, Eldon Schoop, Floris Weers, Amanda Swearngin, Jeffrey Nichols, Yinfei Yang, and Zhe Gan. Ferret-ui: Grounded mobile ui understanding with multimodal llms. In European Conference on Computer Vision, pages 240\u2013255. Springer, 2025."},{"key":"e_1_3_2_1_63_1","first-page":"538","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521\u2013538, 2022."},{"key":"e_1_3_2_1_64_1","first-page":"1062","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. MArk: Exploiting cloud services for Cost-Effective, SLO-Aware machine learning inference serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19), pages 1049\u20131062, Renton, WA, July 2019. USENIX Association."},{"key":"e_1_3_2_1_65_1","volume-title":"Internlm-xcomposer-2.5: A versatile large vision language model supporting long-contextual input and output. arXiv preprint arXiv:2407.03320","author":"Zhang Pan","year":"2024","unstructured":"Pan Zhang, Xiaoyi Dong, Yuhang Zang, Yuhang Cao, Rui Qian, Lin Chen, Qipeng Guo, Haodong Duan, Bin Wang, Linke Ouyang, et al. Internlm-xcomposer-2.5: A versatile large vision language model supporting long-contextual input and output. arXiv preprint arXiv:2407.03320, 2024."},{"key":"e_1_3_2_1_66_1","first-page":"879","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Zheng Lianmin","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. Ansor: Generating HighPerformance tensor programs for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 863\u2013879. USENIX Association, November 2020."},{"key":"e_1_3_2_1_67_1","first-page":"210","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 193\u2013210, Santa Clara, CA, July 2024. USENIX Association."},{"key":"e_1_3_2_1_68_1","first-page":"248","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. ROLLER: Fast and efficient tensor compilation for deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 233\u2013248, Carlsbad, CA, July 2022. USENIX Association."},{"key":"e_1_3_2_1_69_1","first-page":"1005","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhuang Donglin","year":"2024","unstructured":"Donglin Zhuang, Zhen Zheng, Haojun Xia, Xiafei Qiu, Junjie Bai, Wei Lin, and Shuaiwen Leon Song. MonoNN: Enabling a new monolithic optimization space for neural network inference tasks on modern GPU-Centric architectures. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 989\u20131005, Santa Clara, CA, July 2024. USENIX Association."}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:48:24Z","timestamp":1759322904000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764808"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":69,"alternative-id":["10.1145\/3731569.3764808","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764808","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}