{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:14:39Z","timestamp":1780060479569,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":95,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2021YFB2900100"],"award-info":[{"award-number":["2021YFB2900100"]}]},{"name":"National Natural Science Foundation of China","award":["61932013"],"award-info":[{"award-number":["61932013"]}]},{"name":"National Natural Science Foundation of China","award":["62225204"],"award-info":[{"award-number":["62225204"]}]},{"name":"A3 Foresight Program of NSFC","award":["62061146002"],"award-info":[{"award-number":["62061146002"]}]},{"name":"Xiaomi Young Talents Program of Xiaomi Foundation","award":["I have not received a number for this grant"],"award-info":[{"award-number":["I have not received a number for this grant"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809208","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"338-351","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Act Before It\u2019s Too Late: Power-Efficient LLM Inference on Mobile Device"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-7184-3150","authenticated-orcid":false,"given":"Haolin","family":"Chu","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0119-1916","authenticated-orcid":false,"given":"Jinxiao","family":"Fan","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1869-742X","authenticated-orcid":false,"given":"Jiabin","family":"Deng","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6949-3995","authenticated-orcid":false,"given":"Bensong","family":"Yu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3764-9888","authenticated-orcid":false,"given":"Liguang","family":"Xie","sequence":"additional","affiliation":[{"name":"Bytedance technology company Limited, Seattle, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5040-2468","authenticated-orcid":false,"given":"Liang","family":"Liu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7199-5047","authenticated-orcid":false,"given":"Huadong","family":"Ma","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7950-6773","authenticated-orcid":false,"given":"Xiaolong","family":"Zheng","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Perfetto. https:\/\/ui.perfetto.dev\/."},{"key":"e_1_3_2_1_2_1","unstructured":"Snapdragon Profiler. https:\/\/developer.qualcomm.com\/software\/snapdragon-profiler."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3053409"},{"key":"e_1_3_2_1_4_1","first-page":"74","volume-title":"2021 IEEE 34th International System-on-Chip Conference (SOCC)","author":"Akselrod D.","unstructured":"D. Akselrod. Reinforcement learning-based power management architecture for optimal dvfs in socs. In 2021 IEEE 34th International System-on-Chip Conference (SOCC), pages 71\u201374. IEEE, 2021."},{"key":"e_1_3_2_1_5_1","first-page":"672","volume-title":"Proceedings of the 21st ACM Internet Measurement Conference","author":"Almeida M.","year":"2021","unstructured":"M. Almeida, S. Laskaridis, A. Mehrotra, L. Dudziak, I. Leontiadis, and N. D. Lane. Smart at what cost? characterising mobile deep neural networks in the wild. In Proceedings of the 21st ACM Internet Measurement Conference, pages 658\u2013672, 2021."},{"key":"e_1_3_2_1_6_1","volume-title":"AOSP Kernel gs201-gpu.dtsi","author":"AOSP.","year":"2023","unstructured":"AOSP. AOSP Kernel gs201-gpu.dtsi., 2023-2025."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2004.1275298"},{"key":"e_1_3_2_1_8_1","first-page":"293","volume-title":"Proceedings of the 9th ACM SIGCOMM Conference on Internet Measurement","author":"Balasubramanian N.","year":"2009","unstructured":"N. Balasubramanian, A. Balasubramanian, and A. Venkataramani. Energy consumption in mobile phones: a measurement study and implications for network applications. In Proceedings of the 9th ACM SIGCOMM Conference on Internet Measurement, pages 280\u2013293, 2009."},{"key":"e_1_3_2_1_9_1","first-page":"267","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"4","author":"Bharadwaj S.","year":"2023","unstructured":"S. Bharadwaj, S. Das, K. Mazumdar, B. M. Beckmann, and S. Kosonocky. Predict; don't react for enabling efficient fine-grain dvfs in gpus. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 4, pages 253\u2013267, 2023."},{"key":"e_1_3_2_1_10_1","first-page":"374","volume-title":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles","author":"Chen L.","year":"2025","unstructured":"L. Chen, D. Feng, E. Feng, Y. Wang, R. Zhao, Y. Xia, P. Xu, and H. Chen. Characterizing mobile soc for accelerating heterogeneous llm inference. In Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles, pages 359\u2013374, 2025."},{"key":"e_1_3_2_1_11_1","first-page":"481","volume-title":"Proceedings of the 17th annual international conference on mobile systems, applications, and services","author":"Choi Y.","year":"2019","unstructured":"Y. Choi, S. Park, and H. Cha. Graphics-aware power governing for mobile devices. In Proceedings of the 17th annual international conference on mobile systems, applications, and services, pages 469\u2013481, 2019."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","DOI":"10.1109\/TMC.2026.3651722","article-title":"Rams: Runtime adaptive memory scaling for tiny deep learning on iot devices","author":"Chu H.","year":"2026","unstructured":"H. Chu, H. Xin, X. Zheng, L. Liu, and H. Ma. Rams: Runtime adaptive memory scaling for tiny deep learning on iot devices. IEEE Transactions on Mobile Computing, 2026.","journal-title":"IEEE Transactions on Mobile Computing"},{"key":"e_1_3_2_1_13_1","first-page":"137","volume-title":"Proceedings of the 21st ACM Conference on Embedded Networked Sensor Systems","author":"Chu H.","year":"2023","unstructured":"H. Chu, X. Zheng, L. Liu, and H. Ma. nnperf: Demystifying dnn runtime inference latency on mobile platforms. In Proceedings of the 21st ACM Conference on Embedded Networked Sensor Systems, pages 125\u2013137, 2023."},{"key":"e_1_3_2_1_14_1","first-page":"538","volume-title":"2022 IEEE International Conference on Pervasive Computing and Communications Workshops and other Affiliated Events (PerCom Workshops)","author":"Das A.","unstructured":"A. Das, Y. D. Kwon, J. Chauhan, and C. Mascolo. Enabling on-device smartphone gpu based training: Lessons learned. In 2022 IEEE International Conference on Pervasive Computing and Communications Workshops and other Affiliated Events (PerCom Workshops), pages 533\u2013538. IEEE, 2022."},{"key":"e_1_3_2_1_15_1","volume-title":"Hawq-v2: Hessian aware trace-weighted quantization of neural networks. Advances in neural information processing systems, 33:18518\u201318529","author":"Dong Z.","year":"2020","unstructured":"Z. Dong, Z. Yao, D. Arfeen, A. Gholami, M. W. Mahoney, and K. Keutzer. Hawq-v2: Hessian aware trace-weighted quantization of neural networks. Advances in neural information processing systems, 33:18518\u201318529, 2020."},{"key":"e_1_3_2_1_16_1","unstructured":"C. Du. Accelerating llm inference: Fast sampling with gumbel-max trick. https:\/\/huggingface.co\/blog\/cxdu\/fastsampling."},{"key":"e_1_3_2_1_17_1","first-page":"5569","volume-title":"International Conference on Machine Learning","author":"Du N.","unstructured":"N. Du, Y. Huang, A. M. Dai, S. Tong, D. Lepikhin, Y. Xu, M. Krikun, Y. Zhou, A. W. Yu, O. Firat, et al. Glam: Efficient scaling of language models with mixture-of-experts. In International Conference on Machine Learning, pages 5547\u20135569. PMLR, 2022."},{"issue":"4","key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","first-page":"1087","DOI":"10.1109\/TAC.2014.2336358","article-title":"Robust tube mpc for linear systems with multiplicative uncertainty","volume":"60","author":"Fleming J.","year":"2014","unstructured":"J. Fleming, B. Kouvaritakis, and M. Cannon. Robust tube mpc for linear systems with multiplicative uncertainty. IEEE Transactions on Automatic Control, 60(4):1087\u20131092, 2014.","journal-title":"IEEE Transactions on Automatic Control"},{"key":"e_1_3_2_1_19_1","first-page":"10337","volume-title":"International Conference on Machine Learning","author":"Frantar E.","unstructured":"E. Frantar and D. Alistarh. Sparsegpt: Massive language models can be accurately pruned in one-shot. In International Conference on Machine Learning, pages 10323\u201310337. PMLR, 2023."},{"key":"e_1_3_2_1_20_1","volume-title":"Ggml tensor library for machine learning","author":"Gerganov G.","year":"2024","unstructured":"G. Gerganov. Ggml tensor library for machine learning., 2024."},{"key":"e_1_3_2_1_21_1","volume-title":"PopCap Games","author":"Games P.","year":"2009","unstructured":"P. Games. Plants vs. zombies. PopCap Games, 2009."},{"key":"e_1_3_2_1_22_1","unstructured":"Google. Adreno gpu kgsl pwrctrl default gpu idle time. https:\/\/android.googlesource.com\/kernel\/msm\/+\/android-msm-marlin-3.18-nougat-dr1\/drivers\/gpu\/msm\/kgsl_pwrctrl.c."},{"key":"e_1_3_2_1_23_1","unstructured":"Google. Agi guides. https:\/\/developer.android.com\/agi\/start?hl=zh-cn."},{"key":"e_1_3_2_1_24_1","unstructured":"Google. Agi systrace and ftrace. https:\/\/source.android.com\/docs\/core\/tests\/debug\/ftrace#expandable-1."},{"key":"e_1_3_2_1_25_1","unstructured":"Google. Android surfaceflinger api. https:\/\/android.googlesource.com\/platform\/frameworks\/native\/+\/lollipop-release\/services\/surfaceflinger\/SurfaceFlinger.cpp."},{"key":"e_1_3_2_1_26_1","unstructured":"Google. Gemma-2-2b. https:\/\/huggingface.co\/google\/gemma-2-2b."},{"key":"e_1_3_2_1_27_1","volume-title":"Android gpu inspector","year":"2017","unstructured":"Google. Android gpu inspector, 2017."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/2531370"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2018.02.001"},{"key":"e_1_3_2_1_30_1","first-page":"372","volume-title":"2012 USENIX Annual Technical Conference (USENIX ATC 12)","author":"Gupta V.","year":"2012","unstructured":"V. Gupta, P. Brett, D. Koufaty, D. Reddy, S. Hahn, K. Schwan, and G. Srinivasa. The forgotten {'Uncore'}: On the {Energy-Efficiency} of heterogeneous cores. In 2012 USENIX Annual Technical Conference (USENIX ATC 12), pages 367\u2013372, 2012."},{"key":"e_1_3_2_1_31_1","first-page":"76","volume-title":"2016 IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"Halpern M.","unstructured":"M. Halpern, Y. Zhu, and V. J. Reddi. Mobile cpu's rise to power: Quantifying the impact of generational mobile cpu design trends on performance, energy, and user satisfaction. In 2016 IEEE International Symposium on High Performance Computer Architecture (HPCA), pages 64\u201376. IEEE, 2016."},{"key":"e_1_3_2_1_32_1","volume-title":"qpOASES Public","author":"Hans Joachim Ferreau C. K.","year":"2017","unstructured":"C. K. Hans Joachim Ferreau, Andreas Potschka. qpOASES Public, 2017."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"e_1_3_2_1_34_1","first-page":"729","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hu Q.","year":"2024","unstructured":"Q. Hu, Z. Ye, Z. Wang, G. Wang, M. Zhang, Q. Chen, P. Sun, D. Lin, X. Wang, Y. Luo, et al. Characterization of large language model development in the datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pages 709\u2013729, 2024."},{"key":"e_1_3_2_1_35_1","first-page":"91","volume-title":"2024 IEEE 42nd International Conference on Computer Design (ICCD)","author":"Jia Z.","unstructured":"Z. Jia, L. N. Bhuyan, and D. Wong. Pccl: Energy-efficient llm training with power-aware collective communication. In 2024 IEEE 42nd International Conference on Computer Design (ICCD), pages 84\u201391. IEEE, 2024."},{"key":"e_1_3_2_1_36_1","first-page":"52481","article-title":"Minference 1.0: Accelerating pre-filling for long-context llms via dynamic sparse attention","volume":"37","author":"Jiang H.","year":"2025","unstructured":"H. Jiang, Y. Li, C. Zhang, Q. Wu, X. Luo, S. Ahn, Z. Han, A. Abdi, D. Li, C.-Y. Lin, et al. Minference 1.0: Accelerating pre-filling for long-context llms via dynamic sparse attention. Advances in Neural Information Processing Systems, 37:52481\u201352515, 2025.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_37_1","volume-title":"Endor: Hardware-friendly sparse format for offloaded llm inference. arXiv preprint arXiv:2406.11674","author":"Joo D.","year":"2024","unstructured":"D. Joo, R. Hadidi, S. Feizi, and B. Asgari. Endor: Hardware-friendly sparse format for offloaded llm inference. arXiv preprint arXiv:2406.11674, 2024."},{"key":"e_1_3_2_1_38_1","first-page":"1378","volume-title":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"Kakolyris A. K.","unstructured":"A. K. Kakolyris, D. Masouros, P. Vavaroutsos, S. Xydis, and D. Soudris. throt-tll'em: Predictive gpu throttling for energy efficient llm inference serving. In 2025 IEEE International Symposium on High Performance Computer Architecture (HPCA), pages 1363\u20131378. IEEE, 2025."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2024.3406038"},{"key":"e_1_3_2_1_40_1","first-page":"53","volume-title":"Proceedings of the 19th Annual International Conference on Mobile Systems, Applications, and Services","author":"Kim S.","year":"2021","unstructured":"S. Kim, K. Bin, S. Ha, K. Lee, and S. Chong. Ztt: learning-based dvfs with zero thermal throttling for mobile devices. In Proceedings of the 19th Annual International Conference on Mobile Systems, Applications, and Services, pages 41\u201353, 2021."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_42_1","first-page":"907","volume-title":"Proceedings of the 30th Annual International Conference on Mobile Computing and Networking","author":"Laskaridis S.","year":"2024","unstructured":"S. Laskaridis, K. Katevas, L. Minto, and H. Haddadi. Melting point: Mobile evaluation of language transformers. In Proceedings of the 30th Annual International Conference on Mobile Computing and Networking, pages 890\u2013907, 2024."},{"key":"e_1_3_2_1_43_1","first-page":"152","volume-title":"Proceedings of the 26th Symposium on Operating Systems Principles","author":"Li B.","year":"2017","unstructured":"B. Li, Z. Ruan, W. Xiao, Y. Lu, Y. Xiong, A. Putnam, E. Chen, and L. Zhang. Kv-direct: High-performance in-memory key-value store with programmable nic. In Proceedings of the 26th Symposium on Operating Systems Principles, pages 137\u2013152, 2017."},{"key":"e_1_3_2_1_44_1","first-page":"16","volume-title":"Proceedings of the 29th Annual International Conference on Mobile Computing and Networking","author":"Lin C.","year":"2023","unstructured":"C. Lin, K. Wang, Z. Li, and Y. Pu. A workload-aware dvfs robust to concurrent tasks for mobile devices. In Proceedings of the 29th Annual International Conference on Mobile Computing and Networking, pages 1\u201316, 2023."},{"key":"e_1_3_2_1_45_1","first-page":"87","article-title":"Awq: Activation-aware weight quantization for on-device llm compression and acceleration","volume":"6","author":"Lin J.","year":"2024","unstructured":"J. Lin, J. Tang, H. Tang, S. Yang, W.-M. Chen, W.-C. Wang, G. Xiao, X. Dang, C. Gan, and S. Han. Awq: Activation-aware weight quantization for on-device llm compression and acceleration. Proceedings of Machine Learning and Systems, 6:87\u2013100, 2024.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_46_1","first-page":"22176","volume-title":"International Conference on Machine Learning","author":"Liu Z.","unstructured":"Z. Liu, J. Wang, T. Dao, T. Zhou, B. Yuan, Z. Song, A. Shrivastava, C. Zhang, Y. Tian, C. Re, et al. Deja vu: Contextual sparsity for efficient llms at inference time. In International Conference on Machine Learning, pages 22137\u201322176. PMLR, 2023."},{"key":"e_1_3_2_1_47_1","first-page":"1662","volume-title":"2019 American Control Conference (ACC)","author":"Lopez B. T.","unstructured":"B. T. Lopez, J.-J. E. Slotine, and J. P. How. Dynamic tube mpc for nonlinear systems. In 2019 American Control Conference (ACC), pages 1655\u20131662. IEEE, 2019."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","first-page":"1889","DOI":"10.1016\/j.applthermaleng.2007.11.025","article-title":"System thermal analysis for mobile phone","volume":"28","author":"Luo Z.","year":"2008","unstructured":"Z. Luo, H. Cho, X. Luo, and K.-i. Cho. System thermal analysis for mobile phone. Applied Thermal Engineering, 28(14\u201315):1889\u20131895, 2008.","journal-title":"Applied Thermal Engineering"},{"key":"e_1_3_2_1_49_1","volume-title":"Llm-pruner: On the structural pruning of large language models. Advances in neural information processing systems, 36:21702\u201321720","author":"Ma X.","year":"2023","unstructured":"X. Ma, G. Fang, and X. Wang. Llm-pruner: On the structural pruning of large language models. Advances in neural information processing systems, 36:21702\u201321720, 2023."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.automatica.2004.08.019"},{"issue":"8","key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","first-page":"41","DOI":"10.5120\/ijca2019918791","article-title":"Google maps","volume":"178","author":"Mehta H.","year":"2019","unstructured":"H. Mehta, P. Kanani, and P. Lande. Google maps. International Journal of Computer Applications, 178(8):41\u201346, 2019.","journal-title":"International Journal of Computer Applications"},{"key":"e_1_3_2_1_52_1","first-page":"9","volume-title":"2020 IEEE 32nd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)","author":"Mendes F.","unstructured":"F. Mendes, P. Tom\u00e1s, and N. Roma. Exploiting non-conventional dvfs on gpus: Application to deep learning. In 2020 IEEE 32nd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD), pages 1\u20139. IEEE, 2020."},{"key":"e_1_3_2_1_53_1","unstructured":"Meta. Llama-2-7b-chat-hf. https:\/\/huggingface.co\/meta-llama\/Llama-2-7b-chat-hf."},{"key":"e_1_3_2_1_54_1","unstructured":"MLC team. MLC-LLM 2023-2025."},{"key":"e_1_3_2_1_55_1","unstructured":"Msoon. High voltage power monitor. https:\/\/www.msoon.com\/online-store\/High-Voltage-Power-Monitor-p90002590."},{"key":"e_1_3_2_1_56_1","volume-title":"Virtualpower: coordinated power management in virtualized enterprise systems. ACM SIGOPS operating systems review, 41(6):265\u2013278","author":"Nathuji R.","year":"2007","unstructured":"R. Nathuji and K. Schwan. Virtualpower: coordinated power management in virtualized enterprise systems. ACM SIGOPS operating systems review, 41(6):265\u2013278, 2007."},{"key":"e_1_3_2_1_57_1","unstructured":"K. org. Adreno gpu pmu event open list. https:\/\/www.kernel.org\/doc\/Documentation\/trace\/ftrace.txt."},{"key":"e_1_3_2_1_58_1","unstructured":"K. org. The file system. https:\/\/www.kernel.org\/doc\/Documentation\/trace\/ftrace.txt."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651329"},{"key":"e_1_3_2_1_60_1","first-page":"6","volume-title":"Proceedings of the 51st Annual Design Automation Conference","author":"Pathania A.","year":"2014","unstructured":"A. Pathania, Q. Jiao, A. Prakash, and T. Mitra. Integrated cpu-gpu power management for 3d mobile games. In Proceedings of the 51st Annual Design Automation Conference, pages 1\u20136, 2014."},{"key":"e_1_3_2_1_61_1","first-page":"1292","volume-title":"Proceedings of the sixteenth ACM international conference on web search and data mining","author":"Peris C.","year":"2023","unstructured":"C. Peris, C. Dupuy, J. Majmudar, R. Parikh, S. Smaili, R. Zemel, and R. Gupta. Privacy in the time of language models. In Proceedings of the sixteenth ACM international conference on web search and data mining, pages 1291\u20131292, 2023."},{"key":"e_1_3_2_1_62_1","first-page":"102","volume-title":"Proceedings of the eighteenth ACM symposium on Operating systems principles","author":"Pillai P.","year":"2001","unstructured":"P. Pillai and K. G. Shin. Real-time dynamic voltage scaling for low-power embedded operating systems. In Proceedings of the eighteenth ACM symposium on Operating systems principles, pages 89\u2013102, 2001."},{"key":"e_1_3_2_1_63_1","first-page":"93","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Qiu H.","year":"2024","unstructured":"H. Qiu, W. Mao, A. Patke, S. Cui, S. Jha, C. Wang, H. Franke, Z. Kalbarczyk, T. Ba\u015far, and R. K. Iyer. Power-aware deep learning model serving with {\u03bc-Serve}. In 2024 USENIX Annual Technical Conference (USENIX ATC 24), pages 75\u201393, 2024."},{"key":"e_1_3_2_1_64_1","unstructured":"Qualcomm. Adreno gpu kgsl driver. https:\/\/lwn.net\/Articles\/394665\/."},{"key":"e_1_3_2_1_65_1","unstructured":"Qualcomm. Thermal engine. https:\/\/android.googlesource.com\/device\/google\/taimen\/+\/81375f6446dbdbe5c95e8790b716e10253191702\/thermal-engine.conf#:~:text=thresholds%2048000%2050000%2052000%20thresholds_clr MONITOR2."},{"key":"e_1_3_2_1_66_1","unstructured":"Qualcomm. Thermal framework. https:\/\/lore.kernel.org\/all\/30066188-3787-4277-914e-e06c95fe2e1c@linaro.org\/T\/."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.automatica.2012.05.003"},{"key":"e_1_3_2_1_68_1","first-page":"754","volume-title":"Thirteenth International Symposium on Quality Electronic Design (ISQED)","author":"Shen H.","unstructured":"H. Shen, J. Lu, and Q. Qiu. Learning based dvfs for simultaneous temperature, performance and energy management. In Thirteenth International Symposium on Quality Electronic Design (ISQED), pages 747\u2013754. IEEE, 2012."},{"key":"e_1_3_2_1_69_1","volume-title":"Llm pruning and distillation in practice: The minitron approach. arXiv preprint arXiv:2408.11796","author":"Sreenivas S. T.","year":"2024","unstructured":"S. T. Sreenivas, S. Muralidharan, R. Joshi, M. Chochowski, M. Patwary, M. Shoeybi, B. Catanzaro, J. Kautz, and P. Molchanov. Llm pruning and distillation in practice: The minitron approach. arXiv preprint arXiv:2408.11796, 2024."},{"key":"e_1_3_2_1_70_1","unstructured":"STMicroelectronics. Stmicroelectronics stts22h low-voltage ultralow-power temperature sensor. https:\/\/www.st.com\/en\/mems-and-sensors\/stts22h.html#documentation."},{"key":"e_1_3_2_1_71_1","volume-title":"Dynamollm: Designing llm inference clusters for performance and energy efficiency. arXiv preprint arXiv:2408.00741","author":"Stojkovic J.","year":"2024","unstructured":"J. Stojkovic, C. Zhang, \u00cd. Goiri, J. Torrellas, and E. Choukse. Dynamollm: Designing llm inference clusters for performance and energy efficiency. arXiv preprint arXiv:2408.00741, 2024."},{"key":"e_1_3_2_1_72_1","volume-title":"A simple and effective pruning approach for large language models. arXiv preprint arXiv:2306.11695","author":"Sun M.","year":"2023","unstructured":"M. Sun, Z. Liu, A. Bair, and J. Z. Kolter. A simple and effective pruning approach for large language models. arXiv preprint arXiv:2306.11695, 2023."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"e_1_3_2_1_74_1","first-page":"467","volume-title":"Proceedings of the 2009 international conference on computer-aided design","author":"Tan Y.","year":"2009","unstructured":"Y. Tan, W. Liu, and Q. Qiu. Adaptive power management using reinforcement learning. In Proceedings of the 2009 international conference on computer-aided design, pages 461\u2013467, 2009."},{"key":"e_1_3_2_1_75_1","unstructured":"M. team. Phi3.5miniinstructq4f16_1mlc. https:\/\/huggingface.co\/mlc-ai\/Phi-3.5-mini-instruct-q4f16_1-MLC."},{"key":"e_1_3_2_1_76_1","unstructured":"Q. Team. Qwen2.5-1.5b-instruct. https:\/\/huggingface.co\/Qwen\/."},{"key":"e_1_3_2_1_77_1","unstructured":"TensorBlock. Redpajama-incite-chat-instruct-3b. https:\/\/huggingface.co\/togethercomputer\/RedPajama-INCITE-Chat-3B-v1."},{"key":"e_1_3_2_1_78_1","unstructured":"Unity. Mali gpu architecture and mobile studio. https:\/\/www.slideshare.net\/slideshow\/unite-seoul-2019-mali-gpu-architecture-and-mobile-studio\/156022111."},{"key":"e_1_3_2_1_79_1","first-page":"228","volume-title":"Proceedings of the 27th Annual International Conference on Mobile Computing and Networking","author":"Wang M.","year":"2021","unstructured":"M. Wang, S. Ding, T. Cao, Y. Liu, and F. Xu. Asymo: scalable and efficient deep-learning inference on asymmetric mobile cpus. In Proceedings of the 27th Annual International Conference on Mobile Computing and Networking, pages 215\u2013228, 2021."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/3152042.3152066"},{"key":"e_1_3_2_1_81_1","first-page":"2087","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Wang T.","year":"2020","unstructured":"T. Wang, K. Wang, H. Cai, J. Lin, Z. Liu, H. Wang, Y. Lin, and S. Han. Apq: Joint search for network architecture, pruning and quantization policy. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 2078\u20132087, 2020."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"crossref","first-page":"7","DOI":"10.1145\/3700410","volume-title":"Proceedings of the 6th ACM International Conference on Multimedia in Asia Workshops","author":"Wang Z.","year":"2024","unstructured":"Z. Wang, J. Yang, X. Qian, S. Xing, X. Jiang, C. Lv, and S. Zhang. Mnn-llm: A generic inference engine for fast large language model deployment on mobile devices. In Proceedings of the 6th ACM International Conference on Multimedia in Asia Workshops, pages 1\u20137, 2024."},{"key":"e_1_3_2_1_83_1","unstructured":"wiki. Adreno gpu computing power list. https:\/\/wikimili.com\/en\/Adreno."},{"key":"e_1_3_2_1_84_1","unstructured":"wiki. Ioctl api. https:\/\/en.wikipedia.org\/wiki\/Ioctl."},{"key":"e_1_3_2_1_85_1","first-page":"513","volume-title":"Proceedings of the 15th ACM International Conference on Future and Sustainable Energy Systems","author":"Wilkins G.","year":"2024","unstructured":"G. Wilkins, S. Keshav, and R. Mortier. Hybrid heterogeneous clusters can lower the energy consumption of llm inference workloads. In Proceedings of the 15th ACM International Conference on Future and Sustainable Energy Systems, pages 506\u2013513, 2024."},{"key":"e_1_3_2_1_86_1","first-page":"576","volume-title":"2015 IEEE 21st international symposium on high performance computer architecture (HPCA)","author":"Wu G.","unstructured":"G. Wu, J. L. Greathouse, A. Lyashevsky, N. Jayasena, and D. Chiou. Gpgpu performance and power estimation using machine learning. In 2015 IEEE 21st international symposium on high performance computer architecture (HPCA), pages 564\u2013576. IEEE, 2015."},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1145\/2775054.2694360"},{"key":"e_1_3_2_1_88_1","volume-title":"Qa-lora: Quantization-aware low-rank adaptation of large language models. arXiv preprint arXiv:2309.14717","author":"Xu Y.","year":"2023","unstructured":"Y. Xu, L. Xie, X. Gu, X. Chen, H. Chang, H. Zhang, Z. Chen, X. Zhang, and Q. Tian. Qa-lora: Quantization-aware low-rank adaptation of large language models. arXiv preprint arXiv:2309.14717, 2023."},{"key":"e_1_3_2_1_89_1","volume-title":"Llm as a system service on mobile devices. arXiv preprint arXiv:2403.11805","author":"Yin W.","year":"2024","unstructured":"W. Yin, M. Xu, Y. Li, and X. Liu. Llm as a system service on mobile devices. arXiv preprint arXiv:2403.11805, 2024."},{"key":"e_1_3_2_1_90_1","volume-title":"Native sparse attention: Hardware-aligned and natively trainable sparse attention. arXiv preprint arXiv:2502.11089","author":"Yuan J.","year":"2025","unstructured":"J. Yuan, H. Gao, D. Dai, J. Luo, L. Zhao, Z. Zhang, Z. Xie, Y. Wei, L. Wang, Z. Xiao, et al. Native sparse attention: Hardware-aligned and natively trainable sparse attention. arXiv preprint arXiv:2502.11089, 2025."},{"key":"e_1_3_2_1_91_1","volume-title":"Llm inference unveiled: Survey and roofline model insights. arXiv preprint arXiv:2402.16363","author":"Yuan Z.","year":"2024","unstructured":"Z. Yuan, Y. Shang, Y. Zhou, Z. Dong, Z. Zhou, C. Xue, B. Wu, Z. Li, Q. Gu, Y. J. Lee, et al. Llm inference unveiled: Survey and roofline model insights. arXiv preprint arXiv:2402.16363, 2024."},{"key":"e_1_3_2_1_92_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Zhang Y.","unstructured":"Y. Zhang and Y. You. Speedloader: An i\/o efficient scheme for heterogeneous and distributed llm operation. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_93_1","volume-title":"Dissecting the impact of mobile dvfs governors on llm inference performance and energy efficiency. arXiv preprint arXiv:2507.02135","author":"Zhang Z.","year":"2025","unstructured":"Z. Zhang, P. Dash, Y. C. Hu, Q. Xu, J. Li, and H. Guan. Dissecting the impact of mobile dvfs governors on llm inference performance and energy efficiency. arXiv preprint arXiv:2507.02135, 2025."},{"key":"e_1_3_2_1_94_1","first-page":"34661","article-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models","volume":"36","author":"Zhang Z.","year":"2023","unstructured":"Z. Zhang, Y. Sheng, T. Zhou, T. Chen, L. Zheng, R. Cai, Z. Song, Y. Tian, C. R\u00e9, C. Barrett, et al. H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems, 36:34661\u201334710, 2023.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00704"}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:55:22Z","timestamp":1780059322000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809208"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":95,"alternative-id":["10.1145\/3745756.3809208","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809208","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}