{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:44:27Z","timestamp":1782834267295,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T00:00:00Z","timestamp":1777161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-2211882"],"award-info":[{"award-number":["CNS-2211882"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-2239351"],"award-info":[{"award-number":["CNS-2239351"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769315","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"36-54","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["AdaServe: Accelerating Multi-SLO LLM Serving with SLO-Customized Speculative Decoding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6746-6225","authenticated-orcid":false,"given":"Zikun","family":"Li","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1735-4443","authenticated-orcid":false,"given":"Zhuofu","family":"Chen","sequence":"additional","affiliation":[{"name":"Princeton University, Princeton, NJ, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1319-4832","authenticated-orcid":false,"given":"Remi","family":"Delacourt","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5406-0736","authenticated-orcid":false,"given":"Gabriele","family":"Oliaro","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5756-2744","authenticated-orcid":false,"given":"Zeyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4306-3998","authenticated-orcid":false,"given":"Qinghan","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7574-755X","authenticated-orcid":false,"given":"Shuhuai","family":"Lin","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Sunnyvale, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4226-6509","authenticated-orcid":false,"given":"April","family":"Yang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Sunnyvale, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8409-2717","authenticated-orcid":false,"given":"Zhihao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7797-573X","authenticated-orcid":false,"given":"Zhuoming","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2358-805X","authenticated-orcid":false,"given":"Yi-Hsiang","family":"Lai","sequence":"additional","affiliation":[{"name":"Amazon Web Services, San Jose, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3981-9159","authenticated-orcid":false,"given":"Xinhao","family":"Cheng","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9371-8358","authenticated-orcid":false,"given":"Xupeng","family":"Miao","sequence":"additional","affiliation":[{"name":"Purdue University, West Lafayette, IN, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1270-5185","authenticated-orcid":false,"given":"Zhihao","family":"Jia","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Taming throughput-latency tradeoff in llm inference with sarathi-serve. arXiv preprint arXiv:2403.02310","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, Alexey Tumanov, and Ramachandran Ramjee. Taming throughput-latency tradeoff in llm inference with sarathi-serve. arXiv preprint arXiv:2403.02310, 2024."},{"key":"e_1_3_2_1_2_1","unstructured":"Anthropic. Claude 3.5. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet. (Accessed on 10\/11\/2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"et al. Long-bench: A bilingual, multitask benchmark for long context understanding. arXiv preprint arXiv:2308.14508","author":"Bai Yushi","year":"2023","unstructured":"Yushi Bai, Xin Lv, Jiajie Zhang, Hongchang Lyu, Jiankai Tang, Zhidian Huang, Zhengxiao Du, Xiao Liu, Aohan Zeng, Lei Hou, et al. Long-bench: A bilingual, multitask benchmark for long context understanding. arXiv preprint arXiv:2308.14508, 2023."},{"key":"e_1_3_2_1_4_1","volume-title":"How many words do we read per minute? a review and meta-analysis of reading rate. Journal of memory and language, 109:104047","author":"Brysbaert Marc","year":"2019","unstructured":"Marc Brysbaert. How many words do we read per minute? a review and meta-analysis of reading rate. Journal of memory and language, 109:104047, 2019."},{"key":"e_1_3_2_1_5_1","volume-title":"Medusa: Simple llm inference acceleration framework with multiple decoding heads. arXiv preprint arXiv:2401.10774","author":"Cai Tianle","year":"2024","unstructured":"Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, Jason D Lee, Deming Chen, and Tri Dao. Medusa: Simple llm inference acceleration framework with multiple decoding heads. arXiv preprint arXiv:2401.10774, 2024."},{"key":"e_1_3_2_1_6_1","volume-title":"Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318, 2023."},{"key":"e_1_3_2_1_7_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde De Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374, 2021."},{"key":"e_1_3_2_1_8_1","volume-title":"Slos-serve: Optimized serving of multi-slo llms. arXiv preprint arXiv:2504.08784","author":"Chen Siyuan","year":"2025","unstructured":"Siyuan Chen, Zhipeng Jia, Samira Khan, Arvind Krishnamurthy, and Phillip B Gibbons. Slos-serve: Optimized serving of multi-slo llms. arXiv preprint arXiv:2504.08784, 2025."},{"key":"e_1_3_2_1_9_1","volume-title":"Scalable, robust, and hardware-aware speculative decoding. arXiv preprint arXiv:2402.12374","author":"Chen Zhuoming","year":"2024","unstructured":"Zhuoming Chen, Avner May, Ruslan Svirschevski, Yuhsun Huang, Max Ryabinin, Zhihao Jia, and Beidi Chen. Sequoia: Scalable, robust, and hardware-aware speculative decoding. arXiv preprint arXiv:2402.12374, 2024."},{"key":"e_1_3_2_1_10_1","volume-title":"How github copilot serves 400 million completion requests a day","author":"Cheney David","year":"2025","unstructured":"David Cheney. How github copilot serves 400 million completion requests a day, 2025."},{"key":"e_1_3_2_1_11_1","volume-title":"March","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality, March 2023."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599572"},{"key":"e_1_3_2_1_13_1","volume-title":"The llama 3 herd of models. arXiv preprint arXiv:2407.21783","author":"Dubey Abhimanyu","year":"2024","unstructured":"Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. The llama 3 herd of models. arXiv preprint arXiv:2407.21783, 2024."},{"key":"e_1_3_2_1_14_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Fu Yichao","unstructured":"Yichao Fu, Peter Bailis, Ion Stoica, and Hao Zhang. Break the sequential dependency of llm inference using lookahead decoding. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_15_1","unstructured":"Google DeepMind. Gemini pro. https:\/\/deepmind.google\/technologies\/gemini\/pro\/. (Accessed on 10\/11\/2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"September","author":"Gray Alan","year":"2019","unstructured":"Alan Gray. Getting started with cuda graphs, September 2019."},{"key":"e_1_3_2_1_17_1","volume-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948","author":"Guo Daya","year":"2025","unstructured":"Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948, 2025."},{"key":"e_1_3_2_1_18_1","unstructured":"Connor Holmes Masahiro Tanaka Michael Wyatt Ammar Ahmad Awan Jeff Rasley Samyam Rajbhandari Reza Yazdani Aminabadi Heyang Qin Arash Bakhtiari Lev Kurilenko et al. Deepspeed-fastgen: High-throughput text generation for llms via mii and deepspeed-inference. arXiv preprint arXiv:2401.08671 2024."},{"key":"e_1_3_2_1_19_1","volume-title":"Specserve: Efficient and slo-aware large language model serving with adaptive speculative decoding. arXiv preprint arXiv:2503.05096","author":"Huang Kaiyu","year":"2025","unstructured":"Kaiyu Huang, Hao Wu, Zhubo Shi, Han Zou, Minchen Yu, and Qingjiang Shi. Specserve: Efficient and slo-aware large language model serving with adaptive speculative decoding. arXiv preprint arXiv:2503.05096, 2025."},{"key":"e_1_3_2_1_20_1","volume-title":"Openai o1 system card. arXiv preprint arXiv:2412.16720","author":"Jaech Aaron","year":"2024","unstructured":"Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, et al. Openai o1 system card. arXiv preprint arXiv:2412.16720, 2024."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 2nd Conference on Systems and Machine Learning, SysML'19","author":"Matei Zaharia Zhihao Jia","year":"2019","unstructured":"Zhihao Jia Matei Zaharia, and Alex Aiken. Beyond data and model parallelism for deep neural networks. In Proceedings of the 2nd Conference on Systems and Machine Learning, SysML'19, 2019."},{"key":"e_1_3_2_1_22_1","volume-title":"vllm: Easy, fast, and cheap llm serving with pagedattention. See https:\/\/vllm.ai\/ (accessed","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Yu, Joseph E Gonzalez, Hao Zhang, and Ion Stoica. vllm: Easy, fast, and cheap llm serving with pagedattention. See https:\/\/vllm.ai\/ (accessed 9 August 2023), 2023."},{"key":"e_1_3_2_1_23_1","volume-title":"Fast inference from transformers via speculative decoding. arXiv preprint arXiv:2211.17192","author":"Leviathan Yaniv","year":"2022","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. Fast inference from transformers via speculative decoding. arXiv preprint arXiv:2211.17192, 2022."},{"key":"e_1_3_2_1_24_1","volume-title":"Eagle-2: Faster inference of language models with dynamic draft trees. arXiv preprint arXiv:2406.16858","author":"Li Yuhui","year":"2024","unstructured":"Yuhui Li, Fangyun Wei, Chao Zhang, and Hongyang Zhang. Eagle-2: Faster inference of language models with dynamic draft trees. arXiv preprint arXiv:2406.16858, 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"Eagle: Speculative sampling requires rethinking feature uncertainty","author":"Li Yuhui","year":"2024","unstructured":"Yuhui Li, Fangyun Wei, Chao Zhang, and Hongyang Zhang. Eagle: Speculative sampling requires rethinking feature uncertainty, 2024."},{"key":"e_1_3_2_1_26_1","volume-title":"Eagle-3: Scaling up inference acceleration of large language models via training-time test","author":"Li Yuhui","year":"2025","unstructured":"Yuhui Li, Fangyun Wei, Chao Zhang, and Hongyang Zhang. Eagle-3: Scaling up inference acceleration of large language models via training-time test, 2025."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1126\/science.abq1158"},{"key":"e_1_3_2_1_28_1","first-page":"679","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. {AlpaServe}: Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23), pages 663\u2013679, 2023."},{"key":"e_1_3_2_1_29_1","volume-title":"Andes: Defining and enhancing quality-of-experience in llm-based text streaming services. arXiv preprint arXiv:2404.16283","author":"Liu Jiachen","year":"2024","unstructured":"Jiachen Liu, Zhiyu Wu, Jae-Won Chung, Fan Lai, Myungjin Lee, and Mosharaf Chowdhury. Andes: Defining and enhancing quality-of-experience in llm-based text streaming services. arXiv preprint arXiv:2404.16283, 2024."},{"key":"e_1_3_2_1_30_1","volume-title":"Optimizing speculative decoding for serving large language models using goodput","author":"Liu Xiaoxuan","year":"2024","unstructured":"Xiaoxuan Liu, Cade Daniel, Langxiang Hu, Woosuk Kwon, Zhuohan Li, Xiangxi Mo, Alvin Cheung, Zhijie Deng, Ion Stoica, and Hao Zhang. Optimizing speculative decoding for serving large language models using goodput, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"Helix: Serving large language models over heterogeneous gpus and network via max-flow. arXiv preprint arXiv:2406.01566","author":"Mei Yixuan","year":"2024","unstructured":"Yixuan Mei, Yonghao Zhuang, Xupeng Miao, Juncheng Yang, Zhihao Jia, and Rashmi Vinayak. Helix: Serving large language models over heterogeneous gpus and network via max-flow. arXiv preprint arXiv:2406.01566, 2024."},{"key":"e_1_3_2_1_32_1","volume-title":"Towards efficient generative large language model serving: A survey from algorithms to systems. arXiv preprint arXiv:2312.15234","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Hongyi Jin, Tianqi Chen, and Zhihao Jia. Towards efficient generative large language model serving: A survey from algorithms to systems. arXiv preprint arXiv:2312.15234, 2023."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_34_1","volume-title":"Spotserve: Serving generative large language models on preemptible instances. arXiv preprint arXiv:2311.15566","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Chunan Shi, Jiangfei Duan, Xiaoli Xi, Dahua Lin, Bin Cui, and Zhihao Jia. Spotserve: Serving generative large language models on preemptible instances. arXiv preprint arXiv:2311.15566, 2023."},{"key":"e_1_3_2_1_35_1","volume-title":"Mlperf inference: Datacenter","year":"2025","unstructured":"MLCommons. Mlperf inference: Datacenter, 2025."},{"key":"e_1_3_2_1_36_1","volume-title":"Mlperf inference v5.0 advances language model capabilities for genai","year":"2025","unstructured":"MLCommons. Mlperf inference v5.0 advances language model capabilities for genai, 2025."},{"key":"e_1_3_2_1_37_1","volume-title":"Can foundation models wrangle your data? arXiv preprint arXiv:2205.09911","author":"Narayan Avanika","year":"2022","unstructured":"Avanika Narayan, Ines Chami, Laurel Orr, Simran Arora, and Christopher R\u00e9. Can foundation models wrangle your data? arXiv preprint arXiv:2205.09911, 2022."},{"key":"e_1_3_2_1_38_1","unstructured":"NVIDIA. Tensorrt-llm. https:\/\/nvidia.github.io\/TensorRT-LLM\/index.html. (Accessed on 10\/11\/2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Suffixde-coding: A model-free approach to speeding up large language model inference","author":"Oliaro Gabriele","year":"2024","unstructured":"Gabriele Oliaro, Zhihao Jia, Daniel Campos, and Aurick Qiao. Suffixde-coding: A model-free approach to speeding up large language model inference, 2024."},{"key":"e_1_3_2_1_40_1","volume-title":"et al. Flexllm: A system for co-serving large language model inference and parameter-efficient finetuning. arXiv preprint arXiv:2402.18789","author":"Oliaro Gabriele","year":"2024","unstructured":"Gabriele Oliaro, Xupeng Miao, Xinhao Cheng, Vineeth Kada, Ruohan Gao, Yingyi Huang, Remi Delacourt, April Yang, Yingcheng Wang, Mengdi Wu, et al. Flexllm: A system for co-serving large language model inference and parameter-efficient finetuning. arXiv preprint arXiv:2402.18789, 2024."},{"key":"e_1_3_2_1_41_1","unstructured":"OpenAI. Gpt-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/. (Accessed on 10\/11\/2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_43_1","volume-title":"Mooncake: Kimi's kvcache-centric architecture for llm serving. arXiv preprint arXiv:2407.00079","author":"Qin Ruoyu","year":"2024","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. Mooncake: Kimi's kvcache-centric architecture for llm serving. arXiv preprint arXiv:2407.00079, 2024."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1177\/1529100615623267"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00045"},{"key":"e_1_3_2_1_46_1","volume-title":"Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, et al. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950","author":"Roziere Baptiste","year":"2023","unstructured":"Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, et al. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950, 2023."},{"key":"e_1_3_2_1_47_1","first-page":"988","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sheng Ying","year":"2024","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Banghua Zhu, Zhuohan Li, Danyang Zhuo, Joseph E Gonzalez, and Ion Stoica. Fairness in serving large language models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 965\u2013988, 2024."},{"key":"e_1_3_2_1_48_1","volume-title":"Flexgen: High-throughput generative inference of large language models with a single gpu","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Daniel Y. Fu, Zhiqiang Xie, Beidi Chen, Clark Barrett, Joseph E. Gonzalez, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. Flexgen: High-throughput generative inference of large language models with a single gpu, 2023."},{"key":"e_1_3_2_1_49_1","volume-title":"Dynamollm: Designing llm inference clusters for performance and energy efficiency. arXiv preprint arXiv:2408.00741","author":"Stojkovic Jovan","year":"2024","unstructured":"Jovan Stojkovic, Chaojie Zhang, \u00cd\u00f1igo Goiri, Josep Torrellas, and Esha Choukse. Dynamollm: Designing llm inference clusters for performance and energy efficiency. arXiv preprint arXiv:2408.00741, 2024."},{"key":"e_1_3_2_1_50_1","first-page":"36","article-title":"Fast speculative decoding via optimal transport","author":"Sun Ziteng","year":"2024","unstructured":"Ziteng Sun, Ananda Theertha Suresh, Jae Hun Ro, Ahmad Beirami, Himanshu Jain, and Felix Yu. Spectr: Fast speculative decoding via optimal transport. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_51_1","volume-title":"Ml-enhanced code completion improves developer productivity","author":"Tabachnyk Maxim","year":"2022","unstructured":"Maxim Tabachnyk and Stoyan Nikolov. Ml-enhanced code completion improves developer productivity, 2022."},{"key":"e_1_3_2_1_52_1","volume-title":"Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca, 2023."},{"key":"e_1_3_2_1_53_1","volume-title":"Gptvoicetasker: Llm-powered virtual assistant for smartphone. arXiv preprint arXiv:2401.14268","author":"Vu Minh Due","year":"2024","unstructured":"Minh Due Vu, Han Wang, Zhuang Li, Jieshan Chen, Shengdong Zhao, Zhenchang Xing, and Chunyang Chen. Gptvoicetasker: Llm-powered virtual assistant for smartphone. arXiv preprint arXiv:2401.14268, 2024."},{"key":"e_1_3_2_1_54_1","volume-title":"Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Shengyu Liu, Fangyue Liu, Yuanhang Sun, Gang Huang, Xuanzhe Liu, and Xin Jin. Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920, 2023."},{"key":"e_1_3_2_1_55_1","unstructured":"Heming Xia Tao Ge Si-Qing Chen Furu Wei and Zhifang Sui. Speculative decoding: Lossless speedup of autoregressive translation."},{"key":"e_1_3_2_1_56_1","volume-title":"Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115","author":"Yang An","year":"2024","unstructured":"An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, et al. Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115, 2024."},{"key":"e_1_3_2_1_57_1","volume-title":"February","author":"Ye Zihao","year":"2024","unstructured":"Zihao Ye, Lequn Chen, Ruihang Lai, Yilong Zhao, Size Zheng, Junru Shao, Bohan Hou, Hongyi Jin, Yifei Zuo, Liangsheng Yin, Tianqi Chen, and Luis Ceze. Accelerating self-attentions for llm serving with flash-infer, February 2024."},{"key":"e_1_3_2_1_58_1","first-page":"538","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for Transformer-Based generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521\u2013538, Carlsbad, CA, July 2022. USENIX Association."},{"key":"e_1_3_2_1_59_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. Efficiently programming large language models using sglang. arXiv preprint arXiv:2312.07104","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. Efficiently programming large language models using sglang. arXiv preprint arXiv:2312.07104, 2023."},{"key":"e_1_3_2_1_60_1","volume-title":"Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving. arXiv preprint arXiv:2401.09670","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving. arXiv preprint arXiv:2401.09670, 2024."},{"key":"e_1_3_2_1_61_1","unstructured":"Yongchao Zhou Kaifeng Lyu Ankit Singh Rawat Aditya Krishna Menon Afshin Rostamizadeh Sanjiv Kumar Jean-Fran\u00e7ois Kagy and Rishabh Agarwal. Distillspec: Improving speculative decoding via knowledge distillation. arXiv preprint arXiv:2310.08461 2023."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3767295.3769315","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3769315","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3767295.3769315","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T12:14:50Z","timestamp":1780661690000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769315"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":61,"alternative-id":["10.1145\/3767295.3769315","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769315","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}