{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T22:37:26Z","timestamp":1780612646288,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Institute of Information & communications Technology Planning & Evaluation (IITP)","award":["RS-2024-00398157"],"award-info":[{"award-number":["RS-2024-00398157"]}]},{"name":"Institute of Information & communications Technology Planning & Evaluation (IITP)","award":["IITP-2021-0-02048"],"award-info":[{"award-number":["IITP-2021-0-02048"]}]},{"name":"National Research Foundation of Korea (NRF)","award":["RS-2022-NR070834"],"award-info":[{"award-number":["RS-2022-NR070834"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809224","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"580-592","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SAIL: Redesigning Collaborative Language Inference with a Single Server-to-Mobile Handoff"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3659-2615","authenticated-orcid":false,"given":"Gibum","family":"Park","sequence":"first","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8667-5521","authenticated-orcid":false,"given":"Sanghyun","family":"Han","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2430-0270","authenticated-orcid":false,"given":"Yonghwa","family":"Cho","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3975-2566","authenticated-orcid":false,"given":"Chanjeong","family":"Park","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8647-1476","authenticated-orcid":false,"given":"Kyunghan","family":"Lee","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of USENIX OSDI. 117\u2013134","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency tradeoff in LLM inference with Sarathi-Serve. In Proceedings of USENIX OSDI. 117\u2013134."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","DOI":"10.3115\/v1\/W14-33","volume-title":"Proceedings of the Workshop on Statistical Machine Translation. 12\u201358","author":"Bojar Ond\u0159ej","year":"2014","unstructured":"Ond\u0159ej Bojar, Christian Buck, Christian Federmann, Barry Haddow, Philipp Koehn, Johannes Leveling, Christof Monz, Pavel Pecina, Matt Post, Herv\u00e9 Saint-Amand, Radu Soricut, Lucia Specia, and Ale\u0161 Tamchyna. 2014. Findings of the 2014 Workshop on Statistical Machine Translation. In Proceedings of the Workshop on Statistical Machine Translation. 12\u201358."},{"key":"e_1_3_2_1_3_1","volume-title":"RTBAgent: A LLM-based Agent System for Real-Time Bidding. In Companion Proceedings of the ACM Web Conference","author":"Cai Leng","year":"2025","unstructured":"Leng Cai, Junxuan He, Yikai Li, Junjie Liang, Yuanping Lin, Ziming Quan, Yawen Zeng, and Jin Xu. 2025. RTBAgent: A LLM-based Agent System for Real-Time Bidding. In Companion Proceedings of the ACM Web Conference 2025. 104\u2013113."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of ICML.","author":"Cai Tianle","year":"2024","unstructured":"Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, Jason D Lee, Deming Chen, and Tri Dao. 2024. Medusa: Simple LLM inference acceleration framework with multiple decoding heads. In Proceedings of ICML."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","first-page":"23505","DOI":"10.1609\/aaai.v39i22.34519","article-title":"Approximated Variational Bayesian Inverse Reinforcement Learning for Large Language Model Alignment","volume":"39","author":"Cai Yuang","year":"2025","unstructured":"Yuang Cai, Yuyu Yuan, Jinsheng Shi, and Qinhong Lin. 2025. Approximated Variational Bayesian Inverse Reinforcement Learning for Large Language Model Alignment. In Proceedings of AAAI, Vol. 39. 23505\u201323513.","journal-title":"Proceedings of AAAI"},{"key":"e_1_3_2_1_6_1","first-page":"101725","article-title":"Transfer Q-star : Principled Decoding for LLM Alignment","volume":"37","author":"Chakraborty Souradip","year":"2024","unstructured":"Souradip Chakraborty, Soumya Suvra Ghosal, Ming Yin, Dinesh Manocha, Mengdi Wang, Amrit Singh Bedi, and Furong Huang. 2024. Transfer Q-star : Principled Decoding for LLM Alignment. In Proceedings of NeurIPS, Vol. 37. 101725\u2013101761.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of IEEE CVPR. 29083\u201329095","author":"Chen Joya","year":"2025","unstructured":"Joya Chen, Ziyun Zeng, Yiqi Lin, Wei Li, Zejun Ma, and Mike Zheng Shou. 2025. Livecc: Learning video llm with streaming speech transcription at scale. In Proceedings of IEEE CVPR. 29083\u201329095."},{"key":"e_1_3_2_1_8_1","volume-title":"Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168","author":"Cobbe Karl","year":"2021","unstructured":"Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, \u0141ukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, Christopher Hesse, and John Schulman. 2021. Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168 (2021)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of IEEE CVPR. 4178\u20134188","author":"Feng Qianhan","year":"2025","unstructured":"Qianhan Feng, Wenshuo Li, Tong Lin, and Xinghao Chen. 2025. Align-KD: Distilling Cross-Modal Alignment Knowledge for Mobile Vision-Language Large Model Enhancement. In Proceedings of IEEE CVPR. 4178\u20134188."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of ICLR.","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. OPTQ: Accurate Quantization for Generative Pre-trained Transformers. In Proceedings of ICLR."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of ICLR.","author":"Fu Tianyu","year":"2026","unstructured":"Tianyu Fu, Zihan Min, Hanling Zhang, Jichao Yan, Guohao Dai, Wanli Ouyang, and Yu Wang. 2026. Cache-to-Cache: Direct Semantic Communication Between Large Language Models. In Proceedings of ICLR."},{"key":"e_1_3_2_1_12_1","first-page":"59006","article-title":"Efficient llm scheduling by learning to rank","volume":"37","author":"Fu Yichao","year":"2024","unstructured":"Yichao Fu, Siqi Zhu, Runlong Su, Aurick Qiao, Ion Stoica, and Hao Zhang. 2024. Efficient llm scheduling by learning to rank. In Proceedings of NeurIPS, Vol. 37. 59006\u201359029.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_13_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3662006.3662067"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of IEEE INFOCOM. 1423\u20131431","author":"Hu Chuang","year":"2019","unstructured":"Chuang Hu, Wei Bao, Dan Wang, and Fengming Liu. 2019. Dynamic adaptive DNN surgery for inference acceleration on the edge. In Proceedings of IEEE INFOCOM. 1423\u20131431."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2016299"},{"key":"e_1_3_2_1_17_1","first-page":"209","article-title":"CoDL: efficient CPU-GPU co-execution for deep learning inference on mobile devices","volume":"22","author":"Jia Fucheng","year":"2022","unstructured":"Fucheng Jia, Deyu Zhang, Ting Cao, Shiqi Jiang, Yunxin Liu, Ju Ren, and Yaoxue Zhang. 2022. CoDL: efficient CPU-GPU co-execution for deep learning inference on mobile devices.. In Proceedings of ACM MobiSys, Vol. 22. 209\u2013221.","journal-title":"Proceedings of ACM MobiSys"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of ACM SIGCOMM. 253\u2013266","author":"Jiang Junchen","year":"2018","unstructured":"Junchen Jiang, Ganesh Ananthanarayanan, Peter Bodik, Siddhartha Sen, and Ion Stoica. 2018. Chameleon: scalable adaptation of video analytics. In Proceedings of ACM SIGCOMM. 253\u2013266."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of IEEE\/ACM ISCA. 974\u2013989","author":"Jiang Wenqi","year":"2025","unstructured":"Wenqi Jiang, Suvinay Subramanian, Cat Graves, Gustavo Alonso, Amir Yazdanbakhsh, and Vidushi Dadu. 2025. Rago: Systematic performance optimization for retrieval-augmented generation serving. In Proceedings of IEEE\/ACM ISCA. 974\u2013989."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037698"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of ACM KDD. 1128\u20131138","author":"Kawamae Noriaki","year":"2025","unstructured":"Noriaki Kawamae. 2025. Knowledge-Aligned Domain Shift Tuning for Efficient Adaptation in Large Language Models. In Proceedings of ACM KDD. 1128\u20131138."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of ACM MobiSys. 412\u2013424","author":"Kim Jinhyuk","year":"2024","unstructured":"Jinhyuk Kim, Jaewon Lee, and Sunwoo Park. 2024. COACTO: Cooperative Activation Offloading for Real-Time Mobile Vision. In Proceedings of ACM MobiSys. 412\u2013424."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of ACM SOSP. 611\u2013626","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient Memory Management for Large Language Model Serving with PagedAttention. In Proceedings of ACM SOSP. 611\u2013626."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of ACM MobiCom. 1\u201315","author":"Laskaridis Stefanos","year":"2020","unstructured":"Stefanos Laskaridis, Stylianos I Venieris, Mario Almeida, Ilias Leontiadis, and Nicholas D Lane. 2020. SPINN: Synergistic progressive inference of neural networks over device and cloud. In Proceedings of ACM MobiCom. 1\u201315."},{"key":"e_1_3_2_1_25_1","unstructured":"Yaniv Leviathan Matan Kalman and Yossi Matias. 2023. Fast Inference from Transformers via Speculative Decoding. arXiv:2211.17192 [cs.LG] https:\/\/arxiv.org\/abs\/2211.17192"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of USENIX ATC.","author":"Li Suyi","year":"2025","unstructured":"Suyi Li, Hanfeng Lu, Tianyuan Wu, Minchen Yu, Qizhen Weng, Xusheng Chen, Yizhou Shan, Binhang Yuan, and Wei Wang. 2025. TOPPINGS: CPU-Assisted, Rank-Aware Adapter Serving for LLM Inference. In Proceedings of USENIX ATC."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of MLsys. 87\u2013100","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. In Proceedings of MLsys. 87\u2013100."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of IEEE\/ACM ISCA. 884\u2013898","author":"Liu Chaoqiang","year":"2025","unstructured":"Chaoqiang Liu, Haifeng Liu, Dan Chen, Yu Huang, Yi Zhang, Wenjing Xiao, Xiaofei Liao, and Hai Jin. 2025. HeterRAG: Heterogeneous Processing-in-Memory Acceleration for Retrieval-augmented Generation. In Proceedings of IEEE\/ACM ISCA. 884\u2013898."},{"key":"e_1_3_2_1_29_1","volume-title":"DroidSpeak: KV Cache Sharing for Cross-LLM Communication and Multi-LLM Serving. arXiv preprint arXiv:2411.02820","author":"Liu Yuhan","year":"2024","unstructured":"Yuhan Liu, Yuyang Huang, Jiayi Yao, Shaoting Feng, Zhuohan Gu, Kuntai Du, Hanchen Li, Yihua Cheng, Junchen Jiang, Shan Lu, Madan Musuvathi, and Esha Choukse. 2024. DroidSpeak: KV Cache Sharing for Cross-LLM Communication and Multi-LLM Serving. arXiv preprint arXiv:2411.02820 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of ACM CHI. 1\u201320","author":"Liu Yuhan","year":"2025","unstructured":"Yuhan Liu, Aadit Shah, Jordan Ackerman, and Manaswi Saha. 2025. Exploring the Design Space of Real-time LLM Knowledge Support Systems: A Case Study of Jargon Explanations. In Proceedings of ACM CHI. 1\u201320."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of ICML.","author":"Liu Zechun","year":"2024","unstructured":"Zechun Liu, Changsheng Zhao, Forrest Iandola, Chen Lai, Yuandong Tian, Igor Fedorov, Yunyang Xiong, Ernie Chang, Yangyang Shi, Raghuraman Krishnamoorthi, et al. 2024. Mobilellm: Optimizing sub-billion parameter language models for on-device use cases. In Proceedings of ICML."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of ACM ASPLOS. 932\u2013949","author":"Miao Xupeng","year":"2024","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Zhengxin Zhang, Rae Ying Yee Wong, Alan Zhu, Lijie Yang, Xiaoxiang Shi, et al. 2024. SpecInfer: Accelerating generative large language model serving with tree-based speculative inference and verification. In Proceedings of ACM ASPLOS. 932\u2013949."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of ACL. 311\u2013318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. BLEU: a method for automatic evaluation of machine translation. In Proceedings of ACL. 311\u2013318."},{"key":"e_1_3_2_1_34_1","volume-title":"SpecEdge: Scalable Edge-Assisted Serving Framework for Interactive LLMs. arXiv preprint arXiv:2505.17052","author":"Park Jinwoo","year":"2025","unstructured":"Jinwoo Park, Seunggeun Cho, and Dongsu Han. 2025. SpecEdge: Scalable Edge-Assisted Serving Framework for Interactive LLMs. arXiv preprint arXiv:2505.17052 (2025)."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of AAAI. 25056\u201325064","author":"Qin Zongyue","year":"2025","unstructured":"Zongyue Qin, Zifan He, Neha Prakriya, Jason Cong, and Yizhou Sun. 2025. Dynamic-width speculative beam decoding for llm inference. In Proceedings of AAAI. 25056\u201325064."},{"key":"e_1_3_2_1_36_1","volume-title":"Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R. Bowman.","author":"Rein David","year":"2024","unstructured":"David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R. Bowman. 2024. GPQA: A Graduate-Level Google-Proof Q&A Benchmark. In Proceedings of COLM."},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of USENIX ATC. 397\u2013411","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J Yadwadkar, and Christos Kozyrakis. 2021. {INFaaS}: Automated model-less inference serving. In Proceedings of USENIX ATC. 397\u2013411."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of ACM SenSys. 199\u2013212","author":"Shen Leming","year":"2025","unstructured":"Leming Shen, Qiang Yang, Xinyu Huang, Zijing Ma, and Yuanqing Zheng. 2025. Gpiot: Tailoring small language models for iot program synthesis and development. In Proceedings of ACM SenSys. 199\u2013212."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of IEEE\/ACM ISCA. 958\u2013973","author":"Shen Michael","year":"2025","unstructured":"Michael Shen, Muhammad Umar, Kiwan Maeng, G Edward Suh, and Udit Gupta. 2025. Hermes: Algorithm-System Co-design for Efficient Retrieval-Augmented Generation At-Scale. In Proceedings of IEEE\/ACM ISCA. 958\u2013973."},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of ACM ASPLOS. 1266\u20131281","author":"Stojkovic Jovan","year":"2025","unstructured":"Jovan Stojkovic, Chaojie Zhang, \u00cd\u00f1igo Goiri, Esha Choukse, Haoran Qiu, Rodrigo Fonseca, Josep Torrellas, and Ricardo Bianchini. 2025. Tapas: Thermal-and power-aware scheduling for LLM inference in cloud platforms. In Proceedings of ACM ASPLOS. 1266\u20131281."},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of IEEE HPCA. 1348\u20131362","author":"Stojkovic Jovan","year":"2025","unstructured":"Jovan Stojkovic, Chaojie Zhang, \u00cd\u00f1igo Goiri, Josep Torrellas, and Esha Choukse. 2025. Dynamollm: Designing llm inference clusters for performance and energy efficiency. In Proceedings of IEEE HPCA. 1348\u20131362."},{"key":"e_1_3_2_1_42_1","first-page":"30222","article-title":"Spectr: Fast speculative decoding via optimal transport","volume":"36","author":"Sun Ziteng","year":"2023","unstructured":"Ziteng Sun, Ananda Theertha Suresh, Jae Hun Ro, Ahmad Beirami, Himanshu Jain, and Felix Yu. 2023. Spectr: Fast speculative decoding via optimal transport. Proceedings of NeurIPS 36 (2023), 30222\u201330242.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_43_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_44_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of NeurIPS. 6000\u20136010","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All You Need. In Proceedings of NeurIPS. 6000\u20136010."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of ACM SIGCOMM. 496\u2013511","author":"Wang Chenxu","year":"2025","unstructured":"Chenxu Wang, Xumiao Zhang, Runwei Lu, Xianshang Lin, Xuan Zeng, Xinlei Zhang, Zhe An, Gongwei Wu, Jiaqi Gao, Chen Tian, et al. 2025. Towards LLM-Based Failure Localization in Production-Scale Networks. In Proceedings of ACM SIGCOMM. 496\u2013511."},{"key":"e_1_3_2_1_47_1","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Proceedings of NeurIPS 35 (2022), 24824\u201324837.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of ACM SOSP. 640\u2013654","author":"Wu Bingyang","year":"2024","unstructured":"Bingyang Wu, Shengyu Liu, Yinmin Zhong, Peng Sun, Xuanzhe Liu, and Xin Jin. 2024. Loongserve: Efficiently serving long-context large language models with elastic sequence parallelism. In Proceedings of ACM SOSP. 640\u2013654."},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of NeurIPS. 41618\u201341650","author":"Xie Yuxi","year":"2023","unstructured":"Yuxi Xie, Kenji Kawaguchi, Yiran Zhao, James Xu Zhao, Min-Yen Kan, Junxian He, and Michael Xie. 2023. Self-evaluation guided beam search for reasoning. In Proceedings of NeurIPS. 41618\u201341650."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of ACM ASPLOS. 445\u2013462","author":"Xu Daliang","year":"2025","unstructured":"Daliang Xu, Hao Zhang, Liming Yang, Ruiqi Liu, Gang Huang, Mengwei Xu, and Xuanzhe Liu. 2025. Fast on-device LLM inference with npus. In Proceedings of ACM ASPLOS. 445\u2013462."},{"key":"e_1_3_2_1_51_1","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et al. 2025. Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025)."},{"key":"e_1_3_2_1_52_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang et al. 2024. Qwen2 Technical Report. arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of ACM EuroSys. 94\u2013109","author":"Yao Jiayi","year":"2025","unstructured":"Jiayi Yao, Hanchen Li, Yuhan Liu, Siddhant Ray, Yihua Cheng, Qizheng Zhang, Kuntai Du, Shan Lu, and Junchen Jiang. 2025. CacheBlend: Fast large language model serving for RAG with cached knowledge fusion. In Proceedings of ACM EuroSys. 94\u2013109."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of ICLR.","author":"Yao Shunyu","year":"2022","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik R Narasimhan, and Yuan Cao. 2022. React: Synergizing reasoning and acting in language models. In Proceedings of ICLR."},{"key":"e_1_3_2_1_55_1","first-page":"128082","article-title":"A theoretical perspective for speculative decoding algorithm","volume":"37","author":"Yin Ming","year":"2024","unstructured":"Ming Yin, Minshuo Chen, Kaixuan Huang, and Mengdi Wang. 2024. A theoretical perspective for speculative decoding algorithm. Proceedings of NeurIPS 37 (2024), 128082\u2013128117.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of USENIX OSDI. 521\u2013538","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In Proceedings of USENIX OSDI. 521\u2013538."},{"key":"e_1_3_2_1_57_1","volume-title":"Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Guangtao Zeng, Tianduo Wang, and Wei Lu. 2024. Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385 (2024)."},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of USENIX OSDI. 193\u2013210","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving. In Proceedings of USENIX OSDI. 193\u2013210."}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:00:25Z","timestamp":1780059625000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809224"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":58,"alternative-id":["10.1145\/3745756.3809224","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809224","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}