{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T16:06:36Z","timestamp":1780675596742,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":77,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792382","type":"proceedings-article","created":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:54:34Z","timestamp":1775771674000},"page":"5404-5414","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["<scp>LaTune<\/scp>\n                    : Lightweight and Adaptive Configuration Tuning for LLM Inference on Edge Devices"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0134-2683","authenticated-orcid":false,"given":"Siqi","family":"Zhong","sequence":"first","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7625-8721","authenticated-orcid":false,"given":"Mugeng","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4599-3198","authenticated-orcid":false,"given":"Haiyang","family":"Shen","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6159-5761","authenticated-orcid":false,"given":"Chongyang","family":"Pan","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7866-4075","authenticated-orcid":false,"given":"Yun","family":"Ma","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/github.com\/ggerganov\/llama.cpp. Retrieved on","year":"2025","unstructured":"2024. Llama.cpp. https:\/\/github.com\/ggerganov\/llama.cpp. Retrieved on October 1, 2025."},{"key":"e_1_3_2_1_2_1","volume-title":"Ollama: Run Large Language Models Locally. https:\/\/ollama.com. Retrieved on","year":"2025","unstructured":"2024. Ollama: Run Large Language Models Locally. https:\/\/ollama.com. Retrieved on October 1, 2025."},{"key":"e_1_3_2_1_3_1","volume-title":"TensorRT-LLM: A TensorRT Toolbox for Optimized Large Language Model Inference. https:\/\/github.com\/NVIDIA\/TensorRT-LLM. Retrieved on","year":"2025","unstructured":"2025. TensorRT-LLM: A TensorRT Toolbox for Optimized Large Language Model Inference. https:\/\/github.com\/NVIDIA\/TensorRT-LLM. Retrieved on October 1, 2025."},{"key":"e_1_3_2_1_4_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI . 117-134","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming {Throughput-Latency} tradeoff in {LLM} inference with {Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI . 117-134."},{"key":"e_1_3_2_1_5_1","volume-title":"Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, and Ramachandran Ramjee. 2023. Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369 (2023)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00073"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 14th USENIX Conference on Networked Systems Design and Implementation. 469-482","author":"Alipourfard Omid","year":"2017","unstructured":"Omid Alipourfard, Hongqiang Harry Liu, Jianshu Chen, Shivaram Venkataraman, Minlan Yu, and Ming Zhang. 2017. Cherrypick: adaptively unearthing the best cloud configurations for big data analytics. In Proceedings of the 14th USENIX Conference on Networked Systems Design and Implementation. 469-482."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628092"},{"key":"e_1_3_2_1_10_1","volume-title":"Reducing Transformer Key-Value Cache Size with Cross-Layer Attention. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Brandon William","year":"2024","unstructured":"William Brandon, Mayank Mishra, Aniruddha Nrusimha, Rameswar Panda, and Jonathan Ragan-Kelley. 2024. Reducing Transformer Key-Value Cache Size with Cross-Layer Attention. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_11_1","volume-title":"A tutorial on Bayesian optimization of expensive cost functions, with application to active user modeling and hierarchical reinforcement learning. arXiv preprint arXiv:1012.2599","author":"Brochu Eric","year":"2010","unstructured":"Eric Brochu, Vlad M Cora, and Nando De Freitas. 2010. A tutorial on Bayesian optimization of expensive cost functions, with application to active user modeling and hierarchical reinforcement learning. arXiv preprint arXiv:1012.2599 (2010)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference. 1553-1568","author":"Chen Chao","year":"2025","unstructured":"Chao Chen, Shixin Huang, Xuehai Qian, and Zhibin Yu. 2025. Swift: fast performance tuning with GAN-generated configurations. In Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference. 1553-1568."},{"key":"e_1_3_2_1_13_1","first-page":"578","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578-594."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714553"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714930"},{"key":"e_1_3_2_1_16_1","volume-title":"ELIS: Efficient LLM Iterative Scheduling System with Response Length Predictor. arXiv preprint arXiv:2505.09142","author":"Choi Seungbeom","year":"2025","unstructured":"Seungbeom Choi, Jeonghoe Goo, Eunjoo Jeon, Mingyu Yang, and Minsung Jang. 2025. ELIS: Efficient LLM Iterative Scheduling System with Response Length Predictor. arXiv preprint arXiv:2505.09142 (2025)."},{"key":"e_1_3_2_1_17_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)."},{"key":"e_1_3_2_1_18_1","first-page":"16344","article-title":"FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems. Curran Associates","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems. Curran Associates, Inc., 16344-16359.","journal-title":"Inc."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.14778\/1687627.1687767"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/3691992.3691999"},{"key":"e_1_3_2_1_21_1","first-page":"111","volume-title":"Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention. In 2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 111-126."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the Seventh Annual Conference on Machine Learning and Systems, MLSys 2024","author":"Gim In","year":"2024","unstructured":"In Gim, Guojun Chen, Seung-Seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2024. Prompt Cache: Modular Attention Reuse for Low-Latency Inference. In Proceedings of the Seventh Annual Conference on Machine Learning and Systems, MLSys 2024, Santa Clara, CA, USA, May 13-16, 2024."},{"key":"e_1_3_2_1_23_1","unstructured":"Mark A Hall. 1999. Correlation-based feature selection for machine learning. Ph.D. Dissertation. The University of Waikato."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICEC.1994.350037"},{"key":"e_1_3_2_1_25_1","volume-title":"International conference on learning and intelligent optimization. Springer, 507-523","author":"Hutter Frank","year":"2011","unstructured":"Frank Hutter, Holger H Hoos, and Kevin Leyton-Brown. 2011. Sequential modelbased optimization for general algorithm configuration. In International conference on learning and intelligent optimization. Springer, 507-523."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551844"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Florian Karl Tobias Pielok Julia Moosbauer Florian Pfisterer Stefan Coors Martin Binder Lennart Schneider Janek Thomas Jakob Richter Michel Lang Eduardo C. Garrido-Merch\u00e1n Juergen Branke and Bernd Bischl. 2023. Multi- Objective Hyperparameter Optimization in Machine Learning\u2014An Overview. ACM Trans. Evol. Learn. Optim. (2023).","DOI":"10.1145\/3610536"},{"key":"e_1_3_2_1_28_1","volume-title":"ParEGO: A hybrid algorithm with on-line landscape approximation for expensive multiobjective optimization problems","author":"Knowles Joshua","year":"2006","unstructured":"Joshua Knowles. 2006. ParEGO: A hybrid algorithm with on-line landscape approximation for expensive multiobjective optimization problems. IEEE transactions on evolutionary computation (2006), 50-66."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671620"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/3618408.3619203"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC62836.2024.10938426"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611548"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation. 929-945","author":"Lin Chaofan","year":"2024","unstructured":"Chaofan Lin, Zhenhua Han, Chengruidong Zhang, Yuqing Yang, Fan Yang, Chen Chen, and Lili Qiu. 2024. Parrot: efficient serving of LLM-based applications with semantic variable. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation. 929-945."},{"key":"e_1_3_2_1_35_1","first-page":"87","article-title":"AWQ","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. In Proceedings of Machine Learning and Systems. 87-100.","journal-title":"In Proceedings of Machine Learning and Systems."},{"key":"e_1_3_2_1_36_1","volume-title":"WebAssembly for container runtime: Are we there yet? ACM Transactions on Software Engineering and Methodology","author":"Liu Mugeng","year":"2025","unstructured":"Mugeng Liu, Haiyang Shen, Yixuan Zhang, Hong Mei, and Yun Ma. 2025. WebAssembly for container runtime: Are we there yet? ACM Transactions on Software Engineering and Methodology (2025), 1-22."},{"key":"e_1_3_2_1_37_1","volume-title":"A First Look at Bugs in LLM Inference Engines. arXiv preprint arXiv:2506.09713","author":"Liu Mugeng","year":"2025","unstructured":"Mugeng Liu, Siqi Zhong, Weichen Bi, Yixuan Zhang, Zhiyang Chen, Zhenpeng Chen, Xuanzhe Liu, and Yun Ma. 2025. A First Look at Bugs in LLM Inference Engines. arXiv preprint arXiv:2506.09713 (2025)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3726302.3730115"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3736553"},{"key":"e_1_3_2_1_40_1","volume-title":"A unified approach to interpreting model predictions. Advances in neural information processing systems 30","author":"Lundberg Scott M","year":"2017","unstructured":"Scott M Lundberg and Su-In Lee. 2017. A unified approach to interpreting model predictions. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_41_1","volume-title":"Optimizing LLM Inference Throughput via Memory-aware and SLA-constrained Dynamic Batching. arXiv preprint arXiv:2503.05248","author":"Pang Bowen","year":"2025","unstructured":"Bowen Pang, Kai Li, and FeifanWang. 2025. Optimizing LLM Inference Throughput via Memory-aware and SLA-constrained Dynamic Batching. arXiv preprint arXiv:2503.05248 (2025)."},{"key":"e_1_3_2_1_42_1","volume-title":"A Survey on Inference Engines for Large Language Models: Perspectives on Optimization and Efficiency. arXiv preprint arXiv:2505.01658","author":"Park Sihyeong","year":"2025","unstructured":"Sihyeong Park, Sungryeol Jeon, Chaelyn Lee, Seokhun Jeon, Byung-Soo Kim, and Jemin Lee. 2025. A Survey on Inference Engines for Large Language Models: Perspectives on Optimization and Efficiency. arXiv preprint arXiv:2505.01658 (2025)."},{"key":"e_1_3_2_1_43_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST . 155-170","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation \u2014 A KVCache-centric Architecture for Serving LLM Chatbot. In 23rd USENIX Conference on File and Storage Technologies (FAST . 155-170."},{"key":"e_1_3_2_1_44_1","volume-title":"Global Sensitivity Analysis: The Primer","author":"Saltelli Andrea","unstructured":"Andrea Saltelli, Marco Ratto, Terry Andres, Francesca Campolongo, Jessica Cariboni, Debora Gatelli, Michaela Saisana, and Stefano Tarantola. 2008. Global Sensitivity Analysis: The Primer. John Wiley & Sons."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0129065704001899"},{"key":"e_1_3_2_1_46_1","first-page":"68658","article-title":"Flashattention-3: Fast and accurate attention with asynchrony and low-precision","volume":"37","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. Flashattention-3: Fast and accurate attention with asynchrony and low-precision. Advances in Neural Information Processing Systems 37 (2024), 68658-68685.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_47_1","unstructured":"ShareAI Lab. 2023. ShareGPT-Chinese-English-90k: A Bilingual Chinese-English Human-Machine Dialogue Dataset. https:\/\/huggingface.co\/datasets\/shareAI\/ ShareGPT-Chinese-English-90k License: Apache-2.0."},{"key":"e_1_3_2_1_48_1","volume-title":"Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. 2019. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150 (2019)."},{"key":"e_1_3_2_1_49_1","volume-title":"ShortcutsBench: A Large-Scale Real-world Benchmark for API-based Agents. In The Thirteenth International Conference on Learning Representations.","author":"Yue Li Haiyang SHEN","year":"2025","unstructured":"Haiyang SHEN, Yue Li, Desong Meng, Dongqi Cai, Sheng Qi, Li Zhang, Mengwei Xu, and Yun Ma. 2025. ShortcutsBench: A Large-Scale Real-world Benchmark for API-based Agents. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning. 31094-31116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. FlexGen: high-throughput generative inference of large language models with a single GPU. In Proceedings of the 40th International Conference on Machine Learning. 31094-31116."},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation. 173-191","author":"Sun Biao","year":"2024","unstructured":"Biao Sun, Ziming Huang, Hanyu Zhao,Wencong Xiao, Xinyi Zhang, Yong Li, and Wei Lin. 2024. Llumnix: dynamic scheduling for large language model serving. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation. 173-191."},{"key":"e_1_3_2_1_52_1","volume-title":"SpecExec: Massively Parallel Speculative Decoding For Interactive LLM Inference on Consumer Devices. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Svirschevski Ruslan","year":"2024","unstructured":"Ruslan Svirschevski, Avner May, Zhuoming Chen, Beidi Chen, Zhihao Jia, and Max Ryabinin. 2024. SpecExec: Massively Parallel Speculative Decoding For Interactive LLM Inference on Consumer Devices. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_53_1","unstructured":"MLC team. 2023. MLC-LLM. https:\/\/github.com\/mlc-ai\/mlc-llm"},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference. 563-585","author":"Tian Chunlin","year":"2025","unstructured":"Chunlin Tian, Xinpeng Qin, Kahou Tam, Li Li, Zijian Wang, Yuanzhe Zhao, Minglei Zhang, and Chengzhong Xu. 2025. CLONE: customizing LLMs for efficient latency-aware inference at the edge. In Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference. 563-585."},{"key":"e_1_3_2_1_55_1","volume-title":"Mlgo: a machine learning guided compiler optimizations framework. arXiv preprint arXiv:2101.04808","author":"Trofin Mircea","year":"2021","unstructured":"Mircea Trofin, Yundi Qian, Eugene Brevdo, Zinan Lin, Krzysztof Choromanski, and David Li. 2021. Mlgo: a machine learning guided compiler optimizations framework. arXiv preprint arXiv:2101.04808 (2021)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3035918.3064029"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00881"},{"key":"e_1_3_2_1_58_1","volume-title":"Model compression and efficient inference for large language models: A survey. arXiv preprint arXiv:2402.09748","author":"Wang Wenxiao","year":"2024","unstructured":"Wenxiao Wang, Wei Chen, Yicong Luo, Yongliu Long, Zhengkai Lin, Liye Zhang, Binbin Lin, Deng Cai, and Xiaofei He. 2024. Model compression and efficient inference for large language models: A survey. arXiv preprint arXiv:2402.09748 (2024)."},{"key":"e_1_3_2_1_59_1","first-page":"911","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhu Ruidong","year":"2024","unstructured":"BingyangWu, Ruidong Zhu, Zili Zhang, Peng Sun, Xuanzhe Liu, and Xin Jin. 2024. dLoRA: Dynamically Orchestrating Requests and Adapters for LoRA LLM Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 911-927."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/988672.988711"},{"key":"e_1_3_2_1_61_1","volume-title":"Unlocking efficiency in large language model inference: A comprehensive survey of speculative decoding. arXiv preprint arXiv:2401.07851","author":"Xia Heming","year":"2024","unstructured":"Heming Xia, Zhe Yang, Qingxiu Dong, Peiyi Wang, Yongqi Li, Tao Ge, Tianyu Liu, Wenjie Li, and Zhifang Sui. 2024. Unlocking efficiency in large language model inference: A comprehensive survey of speculative decoding. arXiv preprint arXiv:2401.07851 (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"Droidcall: A dataset for llm-powered android intent invocation. arXiv preprint arXiv:2412.00402","author":"Xie Weikai","year":"2024","unstructured":"Weikai Xie, Li Zhang, ShiheWang, Rongjie Yi, and Mengwei Xu. 2024. Droidcall: A dataset for llm-powered android intent invocation. arXiv preprint arXiv:2412.00402 (2024)."},{"key":"e_1_3_2_1_63_1","unstructured":"Mengwei Xu Dongqi Cai Wangsong Yin Shangguang Wang Xin Jin and Xuanzhe Liu. 2025. Resource-efficient Algorithms and Systems of Foundation Models: A Survey. ACM Comput. Surv. (2025)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581791.3596842"},{"key":"e_1_3_2_1_65_1","first-page":"521","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung- Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521-538."},{"key":"e_1_3_2_1_66_1","volume-title":"Challenging GPU Dominance: When CPUs Outperform for On-Device LLM Inference. arXiv preprint arXiv:2505.06461","author":"Zhang Haolin","year":"2025","unstructured":"Haolin Zhang and Jeff Huang. 2025. Challenging GPU Dominance: When CPUs Outperform for On-Device LLM Inference. arXiv preprint arXiv:2505.06461 (2025)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3300085"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34789"},{"key":"e_1_3_2_1_69_1","first-page":"38902","article-title":"Hypervolume maximization: A geometric view of pareto set learning","author":"Zhang Xiaoyuan","year":"2023","unstructured":"Xiaoyuan Zhang, Xi Lin, Bo Xue, Yifan Chen, and Qingfu Zhang. 2023. Hypervolume maximization: A geometric view of pareto set learning. Advances in Neural Information Processing Systems, 38902-38929.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457291"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3526176"},{"key":"e_1_3_2_1_72_1","volume-title":"Dissecting the Impact of Mobile DVFS Governors on LLM Inference Performance and Energy Efficiency. arXiv preprint arXiv:2507.02135","author":"Zhang Zongpu","year":"2025","unstructured":"Zongpu Zhang, Pranab Dash, Y Charlie Hu, Qiang Xu, Jian Li, and Haibing Guan. 2025. Dissecting the Impact of Mobile DVFS Governors on LLM Inference Performance and Energy Efficiency. arXiv preprint arXiv:2507.02135 (2025)."},{"key":"e_1_3_2_1_73_1","first-page":"196","article-title":"Atom","author":"Zhao Yilong","year":"2024","unstructured":"Yilong Zhao, Chien-Yu Lin, Kan Zhu, Zihao Ye, Lequn Chen, Size Zheng, Luis Ceze, Arvind Krishnamurthy, Tianqi Chen, and Baris Kasikci. 2024. Atom: Low-Bit Quantization for Efficient and Accurate LLM Serving. In Proceedings of Machine Learning and Systems. 196-209.","journal-title":"Low-Bit Quantization for Efficient and Accurate LLM Serving. In Proceedings of Machine Learning and Systems."},{"key":"e_1_3_2_1_74_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 863-879","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020. Ansor: generating high-performance tensor programs for deep learning. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 863-879."},{"key":"e_1_3_2_1_75_1","volume-title":"Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al.","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Livia Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. 2024. Sglang: Efficient execution of structured language model programs. Advances in neural information processing systems (2024), 62557-62583."},{"key":"e_1_3_2_1_76_1","volume-title":"Batchllm: Optimizing large batched llm inference with global prefix sharing and throughput-oriented token batching. arXiv preprint arXiv:2412.03594","author":"Zheng Zhen","year":"2024","unstructured":"Zhen Zheng, Xin Ji, Taosong Fang, Fanghao Zhou, Chuanjie Liu, and Gang Peng. 2024. Batchllm: Optimizing large batched llm inference with global prefix sharing and throughput-oriented token batching. arXiv preprint arXiv:2412.03594 (2024)."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3127479.3128605"}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3774904.3792382","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T15:40:18Z","timestamp":1780674018000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792382"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":77,"alternative-id":["10.1145\/3774904.3792382","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792382","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}