{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T13:02:42Z","timestamp":1780664562747,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769382","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"2157-2172","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Scaling LLM Test-Time Compute with Mobile NPU on Smartphones"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-1671-1367","authenticated-orcid":false,"given":"Zixu","family":"Hao","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0830-044X","authenticated-orcid":false,"given":"Jianyu","family":"Wei","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8272-8151","authenticated-orcid":false,"given":"Tuowei","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8024-4928","authenticated-orcid":false,"given":"Minxing","family":"Huang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1327-4882","authenticated-orcid":false,"given":"Huiqiang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4685-9633","authenticated-orcid":false,"given":"Shiqi","family":"Jiang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2782-183X","authenticated-orcid":false,"given":"Ju","family":"Ren","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"100213","volume":"37","author":"Ashkboos Saleh","year":"2024","unstructured":"Saleh Ashkboos, Amirkeivan Mohtashami, Maximilian Croci, Bo Li, Pashmina Cameron, Martin Jaggi, Dan Alistarh, Torsten Hoefler, and James Hensman. Quarot: Outlier-free 4-bit inference in rotated llms. Advances in Neural Information Processing Systems, 37:100213\u2013100240, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/github.com\/powerserve-project\/PowerServe\/tree\/main","author":"Authors PowerServe","year":"2025","unstructured":"PowerServe Authors. Powerserve. https:\/\/github.com\/powerserve-project\/PowerServe\/tree\/main, 2025."},{"key":"e_1_3_2_1_3_1","volume-title":"Large language monkeys: Scaling inference compute with repeated sampling. arXiv preprint arXiv:2407.21787","author":"Brown Bradley","year":"2024","unstructured":"Bradley Brown, Jordan Juravsky, Ryan Ehrlich, Ronald Clark, Quoc V Le, Christopher R\u00e9, and Azalia Mirhoseini. Large language monkeys: Scaling inference compute with repeated sampling. arXiv preprint arXiv:2407.21787, 2024."},{"key":"e_1_3_2_1_4_1","volume-title":"Ruoxi Sun, and Sercan O. Arik. Sets: Leveraging self-verification and self-correction for improved test-time scaling. arXiv preprint arXiv:2501.19306","author":"Chen Jiefeng","year":"2025","unstructured":"Jiefeng Chen, Jie Ren, Xinyun Chen, Cheng Run Yang, Ruoxi Sun, and Sercan O. Arik. Sets: Leveraging self-verification and self-correction for improved test-time scaling. arXiv preprint arXiv:2501.19306, 2025."},{"key":"e_1_3_2_1_5_1","unstructured":"Jiyu Chen Poh Seng Lim Shuang Peng Daxiong Luo JungHau Foo Yap Deep Timothy Lee Jun Jie Kelvin Teh Kae Wen Fan Yang Danyu Feng et al. Edgeinfinite-instruct: Bridging sft-based optimization and npu-level efficiency for edge devices. arXiv preprint arXiv:2508.00370 2025."},{"key":"e_1_3_2_1_6_1","volume-title":"Heterollm: Accelerating large language model inference on mobile socs platform with heterogeneous ai accelerators. arXiv preprint arXiv:2501.14794","author":"Chen Le","year":"2025","unstructured":"Le Chen, Dahu Feng, Erhu Feng, Rong Zhao, Yingrui Wang, Yubin Xia, Haibo Chen, and Pinjie Xu. Heterollm: Accelerating large language model inference on mobile socs platform with heterogeneous ai accelerators. arXiv preprint arXiv:2501.14794, 2025."},{"key":"e_1_3_2_1_7_1","volume-title":"Parallel scaling law for language models. arXiv preprint arXiv:2505.10475","author":"Chen Mouxiang","year":"2025","unstructured":"Mouxiang Chen, Binyuan Hui, Zeyu Cui, Jiaxi Yang, Dayiheng Liu, Jianling Sun, Junyang Lin, and Zhongxin Liu. Parallel scaling law for language models. arXiv preprint arXiv:2505.10475, 2025."},{"key":"e_1_3_2_1_8_1","volume-title":"Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168","author":"Cobbe Karl","year":"2021","unstructured":"Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Jacob Hilton, Reiichiro Nakano, Christopher Hesse, and John Schulman. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168, 2021."},{"key":"e_1_3_2_1_9_1","volume-title":"Intel npu acceleration library. https:\/\/intel.github.io\/intel-npu-acceleration-library\/npu.html","author":"Intel Corporation","year":"2024","unstructured":"Intel Corporation. Intel npu acceleration library. https:\/\/intel.github.io\/intel-npu-acceleration-library\/npu.html, 2024. Accessed: [Insert date here]."},{"key":"e_1_3_2_1_10_1","volume-title":"Intel gaudi 3 ai accelerator white paper. https:\/\/www.intel.com\/content\/www\/us\/en\/content-details\/817486\/intel-gaudi-3-ai-accelerator-white-paper.html","author":"Intel Corporation","year":"2025","unstructured":"Intel Corporation. Intel gaudi 3 ai accelerator white paper. https:\/\/www.intel.com\/content\/www\/us\/en\/content-details\/817486\/intel-gaudi-3-ai-accelerator-white-paper.html, 2025. Accessed: 2025."},{"key":"e_1_3_2_1_11_1","volume-title":"July","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning, July 2023."},{"key":"e_1_3_2_1_12_1","volume-title":"Qlora: Efficient finetuning of quantized llms. arXiv preprint arXiv:2305.14314","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: Efficient finetuning of quantized llms. arXiv preprint arXiv:2305.14314, 2023."},{"key":"e_1_3_2_1_13_1","volume-title":"Making language models better reasoners with step-aware verifier. arXiv preprint arXiv:2306.04509","author":"Dwivedi Yash","year":"2023","unstructured":"Yash Dwivedi, Aman Madaan, Uri Alon, Graham Neubig, Kyle Richardson, Wen-tau Yih, Sourav Roy, and Arman Cohan. Making language models better reasoners with step-aware verifier. arXiv preprint arXiv:2306.04509, 2023."},{"key":"e_1_3_2_1_14_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323, 2022."},{"key":"e_1_3_2_1_15_1","volume-title":"llama.cpp: Inference of meta's llama model (and others) in pure c\/c++. https:\/\/github.com\/ggml-org\/llama.cpp","author":"Gerganov Georgi","year":"2025","unstructured":"Georgi Gerganov. llama.cpp: Inference of meta's llama model (and others) in pure c\/c++. https:\/\/github.com\/ggml-org\/llama.cpp, 2025."},{"key":"e_1_3_2_1_16_1","volume-title":"The Twelfth International Conference on Learning Representations (ICLR)","author":"Hafner Danijar","year":"2024","unstructured":"Danijar Hafner, Pranav Deka, Shixiang Shane Gu, Timothy Lillicrap, and Mohammad Norouzi. Reasoning as planning: LLMs as building blocks of a rational agent. In The Twelfth International Conference on Learning Representations (ICLR), 2024."},{"key":"e_1_3_2_1_17_1","volume-title":"fastrpc_munmap\/remote_mem_unmap does not seem to remove dsp-side mapping immediately #137","author":"Hao Zixu","year":"2025","unstructured":"Zixu Hao. fastrpc_munmap\/remote_mem_unmap does not seem to remove dsp-side mapping immediately #137, 2025."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3662006.3662067"},{"key":"e_1_3_2_1_19_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300, 2020."},{"key":"e_1_3_2_1_20_1","volume-title":"Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:2103.03874","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:2103.03874, 2021."},{"key":"e_1_3_2_1_21_1","volume-title":"et al. Metagpt: Meta programming for multi-agent collaborative framework. arXiv preprint arXiv:2308.00352","author":"Hong Sirui","year":"2023","unstructured":"Sirui Hong, Xiawu Zheng, Jonathan Chen, Yuheng Cheng, Ceyao Lin, Wen-Yi Liu, Bill Yin, David Jiang, Deheng Fu, Zhiyuan Lin, et al. Metagpt: Meta programming for multi-agent collaborative framework. arXiv preprint arXiv:2308.00352, 2023."},{"key":"e_1_3_2_1_22_1","volume-title":"et al. Minicpm: Unveiling the potential of small language models with scalable training strategies. arXiv preprint arXiv:2404.06395","author":"Hu Shengding","year":"2024","unstructured":"Shengding Hu, Yuge Tu, Xu Han, Chaoqun He, Ganqu Cui, Xiang Long, Zhi Zheng, Yewei Fang, Yuxiang Huang, Weilin Zhao, et al. Minicpm: Unveiling the potential of small language models with scalable training strategies. arXiv preprint arXiv:2404.06395, 2024."},{"key":"e_1_3_2_1_23_1","volume-title":"Scaling test-time compute - a hugging face space. https:\/\/huggingface.co\/spaces\/HuggingFaceH4\/blogpost-scaling-test-time-compute","year":"2025","unstructured":"HuggingFaceH4. Scaling test-time compute - a hugging face space. https:\/\/huggingface.co\/spaces\/HuggingFaceH4\/blogpost-scaling-test-time-compute, 2025."},{"key":"e_1_3_2_1_24_1","volume-title":"Amd processors specifications. https:\/\/www.amd.com\/en\/products\/specifications\/processors.html","author":"AMD Inc.","year":"2025","unstructured":"AMD Inc. Amd processors specifications. https:\/\/www.amd.com\/en\/products\/specifications\/processors.html, 2025. Accessed: 2025."},{"key":"e_1_3_2_1_25_1","unstructured":"Qualcomm Incorporated. Unlocking on-device generative ai with an npu and heterogeneous computing. https:\/\/www.qualcomm.com\/content\/dam\/qcomm-martech\/dm-assets\/documents\/Unlocking-on-device-generative-AI-with-an-NPU-and-heterogeneous-computing.pdf 2024. Accessed: 2025."},{"key":"e_1_3_2_1_26_1","volume-title":"Systematic outliers in large language models. arXiv preprint arXiv:2402.01353","author":"Kovaleva Oleksandra","year":"2024","unstructured":"Oleksandra Kovaleva, Tim Dettmers, Mikel Artetxe, Luke Zettlemoyer, Mike Lewis, Gautier Izacard, and Edouard Grave. Systematic outliers in large language models. arXiv preprint arXiv:2402.01353, 2024."},{"key":"e_1_3_2_1_27_1","first-page":"19286","volume-title":"International Conference on Machine Learning","author":"Leviathan Yaniv","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning, pages 19274\u201319286. PMLR, 2023."},{"key":"e_1_3_2_1_28_1","volume-title":"Quantization meets reasoning: Exploring llm low-bit quantization degradation for mathematical reasoning. arXiv preprint arXiv:2501.03035","author":"Li Zhen","year":"2025","unstructured":"Zhen Li, Yupeng Su, Runming Yang, Congkai Xie, Zheng Wang, Zhongwei Xie, Ngai Wong, and Hongxia Yang. Quantization meets reasoning: Exploring llm low-bit quantization degradation for mathematical reasoning. arXiv preprint arXiv:2501.03035, 2025."},{"key":"e_1_3_2_1_29_1","volume-title":"Reward-guided speculative decoding for efficient llm reasoning. arXiv preprint arXiv:2501.19324","author":"Liao Bao Hao","year":"2025","unstructured":"Bao Hao Liao, Yuhui Xu, Hanze Dong, Junnan Li, Christof Monz, Silvio Savarese, Doyen Sahoo, and Caiming Xiong. Reward-guided speculative decoding for efficient llm reasoning. arXiv preprint arXiv:2501.19324, 2025."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00071"},{"key":"e_1_3_2_1_31_1","first-page":"87766","article-title":"Distributing outliers via dual transformation makes stronger quantized llms","volume":"37","author":"Lin Haokun","year":"2024","unstructured":"Haokun Lin, Haobo Xu, Yichen Wu, Jingzhi Cui, Yingtao Zhang, Linzhan Mou, Linqi Song, Zhenan Sun, and Ying Wei. Duquant: Distributing outliers via dual transformation makes stronger quantized llms. Advances in Neural Information Processing Systems, 37:87766\u201387800, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","first-page":"87","article-title":"Activation-aware weight quantization for on-device llm compression and acceleration","volume":"6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. Awq: Activation-aware weight quantization for on-device llm compression and acceleration. Proceedings of Machine Learning and Systems, 6:87\u2013100, 2024.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_33_1","first-page":"14087","volume-title":"Findings of the Association for Computational Linguistics: ACL","author":"Lin Yu-Shan","year":"2023","unstructured":"Yu-Shan Lin, Cheng-En Wu, Hsin-Hsuan Chen, Chi-Jen Lee, and Da-Cheng Juan. Do emergent abilities exist in quantized large language models: An empirical study. In Findings of the Association for Computational Linguistics: ACL 2023, pages 14076\u201314087. Association for Computational Linguistics, 2023."},{"key":"e_1_3_2_1_34_1","volume-title":"Quantization hurts reasoning? an empirical study on quantized reasoning models. arXiv preprint arXiv:2504.04823","author":"Liu Ruikang","year":"2025","unstructured":"Ruikang Liu, Yuxuan Sun, Manyi Zhang, Haoli Bai, Xianzhi Yu, Tiezheng Yu, Chun Yuan, and Lu Hou. Quantization hurts reasoning? an empirical study on quantized reasoning models. arXiv preprint arXiv:2504.04823, 2025."},{"key":"e_1_3_2_1_35_1","volume-title":"Can 1b llm surpass 405b llm? rethinking compute-optimal test-time scaling. arXiv preprint arXiv:2502.06703","author":"Liu Runze","year":"2025","unstructured":"Runze Liu, Junqi Gao, Jian Zhao, Kaiyan Zhang, Xiu Li, Biqing Qi, Wanli Ouyang, and Bowen Zhou. Can 1b llm surpass 405b llm? rethinking compute-optimal test-time scaling. arXiv preprint arXiv:2502.06703, 2025."},{"key":"e_1_3_2_1_36_1","volume-title":"Spinquant: Llm quantization with learned rotations. arXiv preprint arXiv:2405.16406","author":"Liu Zechun","year":"2024","unstructured":"Zechun Liu, Changsheng Zhao, Igor Fedorov, Bilge Soran, Dhruv Choudhary, Raghuraman Krishnamoorthi, Vikas Chandra, Yuandong Tian, and Tijmen Blankevoort. Spinquant: Llm quantization with learned rotations. arXiv preprint arXiv:2405.16406, 2024."},{"key":"e_1_3_2_1_37_1","volume-title":"Improving multi-candidate speculative decoding. arXiv preprint arXiv:2409.10644","author":"Lu Xiaofan","year":"2024","unstructured":"Xiaofan Lu, Yixiao Zeng, Feiyang Ma, Zixu Yu, and Marco Levorato. Improving multi-candidate speculative decoding. arXiv preprint arXiv:2409.10644, 2024."},{"key":"e_1_3_2_1_38_1","volume-title":"Bluelm-v-3b: Algorithm and system co-design for multimodal large language models on mobile devices. arXiv preprint arXiv:2411.10640","author":"Lu Xudong","year":"2024","unstructured":"Xudong Lu, Yinghao Chen, Cheng Chen, Hui Tan, Boheng Chen, Yina Xie, Rui Hu, Guanxin Tan, Renshou Wu, Yan Hu, et al. Bluelm-v-3b: Algorithm and system co-design for multimodal large language models on mobile devices. arXiv preprint arXiv:2411.10640, 2024."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/HCS59251.2023.10254715"},{"key":"e_1_3_2_1_40_1","volume-title":"Meta AI Blog","author":"Llama","year":"2024","unstructured":"Meta. Llama 3.2: Vision and edge-optimized models for multimodal and mobile ai. Meta AI Blog, 2024."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_42_1","volume-title":"Online normalizer calculation for softmax. arXiv preprint arXiv:1805.02867","author":"Milakov Maxim","year":"2018","unstructured":"Maxim Milakov and Natalia Gimelshein. Online normalizer calculation for softmax. arXiv preprint arXiv:1805.02867, 2018."},{"key":"e_1_3_2_1_43_1","volume-title":"Skywork-o1 open series. https:\/\/huggingface.co\/Skywork","author":"Skywork Team","year":"2024","unstructured":"o1 Team Skywork. Skywork-o1 open series. https:\/\/huggingface.co\/Skywork, 2024."},{"key":"e_1_3_2_1_44_1","volume-title":"Zhenrui Zheng, Driss Guessous, Vasiliy Kuznetsov, Christian Puhrsch, Mark Saroufim, et al. Torchao: Pytorch-native training-to-serving model optimization. arXiv preprint arXiv:2507.16099","author":"Or Andrew","year":"2025","unstructured":"Andrew Or, Apurva Jain, Daniel Vega-Myhre, Jesse Cai, Charles David Hernandez, Zhenrui Zheng, Driss Guessous, Vasiliy Kuznetsov, Christian Puhrsch, Mark Saroufim, et al. Torchao: Pytorch-native training-to-serving model optimization. arXiv preprint arXiv:2507.16099, 2025."},{"key":"e_1_3_2_1_45_1","volume-title":"Boosting llm reasoning: A crossover of tree of thoughts and retrieve-augmented generation. arXiv preprint arXiv:2310.00844","author":"Paranjape Bhavana","year":"2023","unstructured":"Bhavana Paranjape, Amit Budhiraja, Amir Stdehghani, Gholamreza Haffari, and Sameer Sawhney. Boosting llm reasoning: A crossover of tree of thoughts and retrieve-augmented generation. arXiv preprint arXiv:2310.00844, 2023."},{"key":"e_1_3_2_1_46_1","unstructured":"Qualcomm Innovation Center Inc. fastrpc: Fastrpc library for linux userspace. https:\/\/github.com\/quic\/fastrpc 2025."},{"key":"e_1_3_2_1_47_1","volume-title":"Qualcomm Technologies. Qualcomm\u00ae ai engine direct sdk. https:\/\/developer.qualcomm.com\/software\/qualcomm-ai-engine-direct-sdk","author":"Inc.","year":"2025","unstructured":"Inc. Qualcomm Technologies. Qualcomm\u00ae ai engine direct sdk. https:\/\/developer.qualcomm.com\/software\/qualcomm-ai-engine-direct-sdk, 2025. Unified API for AI development on Qualcomm accelerators (Hexagon DSP, Kryo CPU, Adreno GPU). Supports direct offloading for TensorFlow Lite\/ONNX Runtime, with HTP (Hexagon Tensor Accelerator) and CDSP (Hexagon Compute DSP) backend optimizations. Accessed: 2025-05-16."},{"key":"e_1_3_2_1_48_1","volume-title":"Amd xdna\u2122 npu in ryzen\u2122 ai processors","author":"Rico Alejandro","year":"2024","unstructured":"Alejandro Rico, Satyaprakash Pareek, Javier Cabezas, David Clarke, Baris Ozgul, Francisco Barat, Yao Fu, Stephan M\u00fcnz, Dylan Stuart, Patrick Schlangen, et al. Amd xdna\u2122 npu in ryzen\u2122 ai processors. IEEE Micro, 2024."},{"key":"e_1_3_2_1_49_1","volume-title":"Chandra Bhagavatula, and Yejin Choi. Winogrande: An adversarial winograd schema challenge at scale. arXiv preprint arXiv:1907.10641","author":"Sakaguchi Keisuke","year":"2019","unstructured":"Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. Winogrande: An adversarial winograd schema challenge at scale. arXiv preprint arXiv:1907.10641, 2019."},{"key":"e_1_3_2_1_50_1","volume-title":"Scaling llm test-time compute optimally can be more effective than scaling model parameters. arXiv preprint arXiv:2408.03314","author":"Snell Charlie","year":"2024","unstructured":"Charlie Snell, Jaehoon Lee, Kelvin Xu, and Aviral Kumar. Scaling llm test-time compute optimally can be more effective than scaling model parameters. arXiv preprint arXiv:2408.03314, 2024."},{"key":"e_1_3_2_1_51_1","volume-title":"Gemma 3 technical report. arXiv preprint arXiv:2503.19786","author":"Team Gemma","year":"2025","unstructured":"Gemma Team, Aishwarya Kamath, Johan Ferret, Shreya Pathak, Nino Vieillard, Ramona Merhej, Sarah Perrin, Tatiana Matejovicova, Alexandre Ram\u00e9, Morgane Rivi\u00e8re, et al. Gemma 3 technical report. arXiv preprint arXiv:2503.19786, 2025."},{"key":"e_1_3_2_1_52_1","volume-title":"Juliette Love, et al. Gemma: Open models based on gemini research and technology. arXiv preprint arXiv:2403.08295","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Thomas Mesnard, Cassidy Hardin, Robert Dadashi, Surya Bhupatiraju, Shreya Pathak, Laurent Sifre, Morgane Rivi\u00e8re, Mihir Sanjay Kale, Juliette Love, et al. Gemma: Open models based on gemini research and technology. arXiv preprint arXiv:2403.08295, 2024."},{"key":"e_1_3_2_1_53_1","volume-title":"Executorch: A pytorch platform for on-device deployment. https:\/\/github.com\/pytorch\/executorch","author":"Team PyTorch","year":"2025","unstructured":"PyTorch Team. Executorch: A pytorch platform for on-device deployment. https:\/\/github.com\/pytorch\/executorch, 2025. Accessed: 2025."},{"key":"e_1_3_2_1_54_1","volume-title":"Math-shepherd: Verify and reinforce llms step-by-step without human annotations. arXiv preprint arXiv:2312.08935","author":"Wang Peiyi","year":"2023","unstructured":"Peiyi Wang, Lei Li, Zhihong Shao, RX Xu, Damai Dai, Yifei Li, Deli Chen, Yu Wu, and Zhifang Sui. Math-shepherd: Verify and reinforce llms step-by-step without human annotations. arXiv preprint arXiv:2312.08935, 2023."},{"key":"e_1_3_2_1_55_1","volume-title":"Chi, and Denny Zhou. Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:2303.11366","author":"Wang Xuezhi","year":"2023","unstructured":"Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, and Denny Zhou. Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:2303.11366, 2023."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696099"},{"key":"e_1_3_2_1_57_1","volume-title":"Inference scaling laws: An empirical analysis of compute-optimal inference for problem-solving with language models. arXiv preprint arXiv:2408.00724","author":"Wu Yangzhen","year":"2024","unstructured":"Yangzhen Wu, Zhiqing Sun, Shanda Li, Sean Welleck, and Yiming Yang. Inference scaling laws: An empirical analysis of compute-optimal inference for problem-solving with language models. arXiv preprint arXiv:2408.00724, 2024."},{"key":"e_1_3_2_1_58_1","first-page":"38099","volume-title":"International Conference on Machine Learning","author":"Xiao Guangxuan","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning, pages 38087\u201338099. PMLR, 2023."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707239"},{"key":"e_1_3_2_1_60_1","volume-title":"Towards large reasoning models: A survey on scaling llm reasoning capabilities. arXiv preprint arXiv:2501.09686","author":"Xu Fengli","year":"2025","unstructured":"Fengli Xu, Qianyue Hao, Zefang Zong, Jingwei Wang, Yunke Zhang, Jingyi Wang, Xiaochong Lan, Jiahui Gong, Tianjian Ouyang, Fan Jinmeng, et al. Towards large reasoning models: A survey on scaling llm reasoning capabilities. arXiv preprint arXiv:2501.09686, 2025."},{"key":"e_1_3_2_1_61_1","volume-title":"Powerinfer-2: Fast large language model inference on a smartphone. arXiv preprint arXiv:2406.06282","author":"Xue Zhenliang","year":"2024","unstructured":"Zhenliang Xue, Yixin Song, Zeyu Mi, Xinrui Zheng, Yubin Xia, and Haibo Chen. Powerinfer-2: Fast large language model inference on a smartphone. arXiv preprint arXiv:2406.06282, 2024."},{"key":"e_1_3_2_1_62_1","volume-title":"Empowering agentic video analytics systems with video language models. arXiv preprint arXiv:2505.00254","author":"Yan Yuxuan","year":"2025","unstructured":"Yuxuan Yan, Shiqi Jiang, Ting Cao, Yifan Yang, Qianqian Yang, Yuanchao Shu, Yuqing Yang, and Lili Qiu. Empowering agentic video analytics systems with video language models. arXiv preprint arXiv:2505.00254, 2025."},{"key":"e_1_3_2_1_63_1","volume-title":"Qwen 2.5 technical report. arXiv preprint arXiv:2412.15115","author":"Yang An","year":"2024","unstructured":"An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, et al. Qwen 2.5 technical report. arXiv preprint arXiv:2412.15115, 2024."},{"key":"e_1_3_2_1_64_1","volume-title":"Tree of thoughts: Deliberate problem solving with large language models. Advances in neural information processing systems, 36:11809\u201311822","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. Tree of thoughts: Deliberate problem solving with large language models. Advances in neural information processing systems, 36:11809\u201311822, 2023."},{"key":"e_1_3_2_1_65_1","volume-title":"et al. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, et al. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800, 2024."},{"key":"e_1_3_2_1_66_1","volume-title":"Dynamic sparse attention on mobile socs. arXiv preprint arXiv:2508.16703","author":"Yin Wangsong","year":"2025","unstructured":"Wangsong Yin, Daliang Xu, Mengwei Xu, Gang Huang, and Xuanzhe Liu. Dynamic sparse attention on mobile socs. arXiv preprint arXiv:2508.16703, 2025."},{"key":"e_1_3_2_1_67_1","volume-title":"Process-bench: Identifying process errors in mathematical reasoning. arXiv preprint arXiv:2412.06559","author":"Zheng Chujie","year":"2024","unstructured":"Chujie Zheng, Zhenru Zhang, Beichen Zhang, Runji Lin, Keming Lu, Bowen Yu, Dayiheng Liu, Jingren Zhou, and Junyang Lin. Process-bench: Identifying process errors in mathematical reasoning. arXiv preprint arXiv:2412.06559, 2024."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3769382","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T12:07:42Z","timestamp":1780661262000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769382"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":67,"alternative-id":["10.1145\/3767295.3769382","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769382","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}