{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T23:07:46Z","timestamp":1768345666840,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"name":"the Science and Technology Commission of Shanghai Municipality","award":["24DP1500704 and 24YL1901100"],"award-info":[{"award-number":["24DP1500704 and 24YL1901100"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["22120230311"],"award-info":[{"award-number":["22120230311"]}]},{"name":"the Guangdong Provincial Key Laboratory of Mathematical Foundations for Artificial Intelligence","award":["2023B1212010001"],"award-info":[{"award-number":["2023B1212010001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772239","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"361-374","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AdaSpec: Adaptive Speculative Decoding for Fast, SLO-Aware Large Language Model Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2340-4143","authenticated-orcid":false,"given":"Kaiyu","family":"Huang","sequence":"first","affiliation":[{"name":"Tongji University, Shanghai, China and Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Shenzhen, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2570-4648","authenticated-orcid":false,"given":"Hao","family":"Wu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2042-2947","authenticated-orcid":false,"given":"Zhubo","family":"Shi","sequence":"additional","affiliation":[{"name":"Tongji University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7503-542X","authenticated-orcid":false,"given":"Han","family":"Zou","sequence":"additional","affiliation":[{"name":"Tongji University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6797-9028","authenticated-orcid":false,"given":"Minchen","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Data Science, The Chinese University of Hong Kong, Shenzhen, Shenzhen, Guangdong, China and Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Shenzhen, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0507-9080","authenticated-orcid":false,"given":"Qingjiang","family":"Shi","sequence":"additional","affiliation":[{"name":"Tongji University, Shanghai, China and Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Shenzhen, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming {Throughput-Latency} tradeoff in {LLM} inference with {Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 117\u2013134."},{"key":"e_1_3_2_1_3_1","volume-title":"A neural probabilistic language model. Advances in neural information processing systems 13","author":"Bengio Yoshua","year":"2000","unstructured":"Yoshua Bengio, R\u00e9jean Ducharme, and Pascal Vincent. 2000. A neural probabilistic language model. Advances in neural information processing systems 13 (2000)."},{"key":"e_1_3_2_1_4_1","volume-title":"Lin (Eds.)","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 1877\u20131901. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"5235","author":"Cai Tianle","year":"2024","unstructured":"Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, Jason D. Lee, Deming Chen, and Tri Dao. 2024. Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 235), Ruslan Salakhutdinov, Zico Kolter, Katherine Heller, Adrian Weller, Nuria Oliver, Jonathan Scarlett, and Felix Berkenkamp (Eds.). PMLR, 5209\u20135235. https:\/\/proceedings.mlr.press\/v235\/cai24b.html"},{"key":"e_1_3_2_1_6_1","volume-title":"Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. 2023. Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Ian En-Hsu Yen, and Beidi Chen","author":"Chen Jian","year":"2024","unstructured":"Jian Chen, Vashisth Tiwari, Ranajoy Sadhukhan, Zhuoming Chen, Jinyuan Shi, Ian En-Hsu Yen, and Beidi Chen. 2024. Magicdec: Breaking the latency-throughput tradeoff for long context generation with speculative decoding. arXiv preprint arXiv:2408.11049 (2024)."},{"key":"e_1_3_2_1_8_1","unstructured":"Karl Cobbe Vineet Kosaraju Mohammad Bavarian Mark Chen Heewoo Jun Lukasz Kaiser Matthias Plappert Jerry Tworek Jacob Hilton Reiichiro Nakano et al. 2021. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021)."},{"key":"e_1_3_2_1_9_1","volume-title":"Forty-first International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=mk8oRhox2l","author":"Du Cunxiao","year":"2024","unstructured":"Cunxiao Du, Jing Jiang, Xu Yuanchen, Jiawei Wu, Sicheng Yu, Yongqi Li, Shenggui Li, Kai Xu, Liqiang Nie, Zhaopeng Tu, and Yang You. 2024. GliDe with a CaPE: A Low-Hassle Method to Accelerate Speculative Decoding. In Forty-first International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=mk8oRhox2l"},{"key":"e_1_3_2_1_10_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Break the Sequential Dependency of LLM Inference Using Lookahead Decoding. In Forty-first International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=eDjvSFOkXw","author":"Fu Yichao","year":"2024","unstructured":"Yichao Fu, Peter Bailis, Ion Stoica, and Hao Zhang. 2024. Break the Sequential Dependency of LLM Inference Using Lookahead Decoding. In Forty-first International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=eDjvSFOkXw"},{"key":"e_1_3_2_1_12_1","volume-title":"Proc. USENIX OSDI.","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. ServerlessLLM: Locality-Enhanced Serverless Inference for Large Language Models. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190541"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.88"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486993"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"e_1_3_2_1_17_1","volume-title":"Levine (Eds.)","volume":"36","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Karttikeya Mangalam, Suhong Moon, Jitendra Malik, Michael W Mahoney, Amir Gholami, and Kurt Keutzer. 2023. Speculative Decoding with Big Little Decoder. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 39236\u201339256. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/7b97adeafa1c51cf65263459ca9d0d7c-Paper-Conference.pdf"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"19286","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast Inference from Transformers via Speculative Decoding. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19274\u201319286. https:\/\/proceedings.mlr.press\/v202\/leviathan23a.html"},{"key":"e_1_3_2_1_21_1","volume-title":"EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty. In Forty-first International Conference on Machine Learning.","author":"Li Yuhui","unstructured":"Yuhui Li, Fangyun Wei, Chao Zhang, and Hongyang Zhang. [n. d.]. EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_22_1","volume-title":"Eagle-2: Faster inference of language models with dynamic draft trees. arXiv preprint arXiv:2406.16858","author":"Li Yuhui","year":"2024","unstructured":"Yuhui Li, Fangyun Wei, Chao Zhang, and Hongyang Zhang. 2024. Eagle-2: Faster inference of language models with dynamic draft trees. arXiv preprint arXiv:2406.16858 (2024)."},{"key":"e_1_3_2_1_23_1","unstructured":"Zikun Li Zhuofu Chen Remi Delacourt Gabriele Oliaro Zeyu Wang Qinghan Chen Shuhuai Lin April Yang Zhihao Zhang Zhuoming Chen et al. 2025. AdaServe: SLO-Customized LLM Serving with Fine-Grained Speculative Decoding. arXiv preprint arXiv:2501.12162 (2025)."},{"key":"e_1_3_2_1_24_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Andes: Defining and enhancing quality-of-experience in llm-based text streaming services. arXiv preprint arXiv:2404.16283","author":"Liu Jiachen","year":"2024","unstructured":"Jiachen Liu, Jae-Won Chung, Zhiyu Wu, Fan Lai, Myungjin Lee, and Mosharaf Chowdhury. 2024. Andes: Defining and enhancing quality-of-experience in llm-based text streaming services. arXiv preprint arXiv:2404.16283 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Optimizing Speculative Decoding for Serving Large Language Models Using Goodput. arXiv preprint arXiv:2406.14066","author":"Liu Xiaoxuan","year":"2024","unstructured":"Xiaoxuan Liu, Cade Daniel, Langxiang Hu, Woosuk Kwon, Zhuohan Li, Xiangxi Mo, Alvin Cheung, Zhijie Deng, Ion Stoica, and Hao Zhang. 2024. Optimizing Speculative Decoding for Serving Large Language Models Using Goodput. arXiv preprint arXiv:2406.14066 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Online Speculative Decoding. In Forty-first International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=BPQHXwVNvl","author":"Liu Xiaoxuan","year":"2024","unstructured":"Xiaoxuan Liu, Lanxiang Hu, Peter Bailis, Alvin Cheung, Zhijie Deng, Ion Stoica, and Hao Zhang. 2024. Online Speculative Decoding. In Forty-first International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=BPQHXwVNvl"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.631"},{"key":"e_1_3_2_1_29_1","volume-title":"SSSD: Simply-Scalable Speculative Decoding. arXiv preprint arXiv:2411.05894","author":"Marzollo Michele","year":"2024","unstructured":"Michele Marzollo, Jiawei Zhuang, Niklas Roemer, Lorenz K M\u00fcller, and Lukas Cavigelli. 2024. SSSD: Simply-Scalable Speculative Decoding. arXiv preprint arXiv:2411.05894 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K16-1028"},{"key":"e_1_3_2_1_32_1","unstructured":"NVIDIA. 2024. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM."},{"key":"e_1_3_2_1_33_1","volume-title":"Splitwise: Efficient generative LLM inference using phase splitting. In ISCA. https:\/\/www.microsoft.com\/en-us\/research\/publication\/splitwise-efficient-generative-llm-inference-using-phase-splitting\/","author":"Patel Pratyush","year":"2024","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, Aashaka Shah, \u00cd\u00f1igo Goiri, Saeed Maleki, and Ricardo Bianchini. 2024. Splitwise: Efficient generative LLM inference using phase splitting. In ISCA. https:\/\/www.microsoft.com\/en-us\/research\/publication\/splitwise-efficient-generative-llm-inference-using-phase-splitting\/"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3698038.3698523"},{"key":"e_1_3_2_1_35_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation\u2014A {KVCache-centric} Architecture for Serving {LLM} Chatbot. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). 155\u2013170."},{"key":"e_1_3_2_1_36_1","volume-title":"Llumnix: Dynamic Scheduling for Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sun Biao","year":"2024","unstructured":"Biao Sun, Ziming Huang, Hanyu Zhao, Wencong Xiao, Xinyi Zhang, Yong Li, and Wei Lin. 2024. Llumnix: Dynamic Scheduling for Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 173\u2013191. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/sun-biao"},{"key":"e_1_3_2_1_37_1","volume-title":"Triforce: Lossless acceleration of long sequence generation with hierarchical speculative decoding. arXiv preprint arXiv:2404.11912","author":"Sun Hanshi","year":"2024","unstructured":"Hanshi Sun, Zhuoming Chen, Xinyu Yang, Yuandong Tian, and Beidi Chen. 2024. Triforce: Lossless acceleration of long sequence generation with hierarchical speculative decoding. arXiv preprint arXiv:2404.11912 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Levine (Eds.)","volume":"36","author":"Sun Ziteng","year":"2023","unstructured":"Ziteng Sun, Ananda Theertha Suresh, Jae Hun Ro, Ahmad Beirami, Himanshu Jain, and Felix Yu. 2023. SpecTr: Fast Speculative Decoding via Optimal Transport. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 30222\u201330242. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/6034a661584af6c28fd97a6f23e56c0a-Paper-Conference.pdf"},{"key":"e_1_3_2_1_39_1","volume-title":"Amelie Chi Zhou, et al","author":"Wang Yuxin","year":"2024","unstructured":"Yuxin Wang, Yuhan Chen, Zeyu Li, Xueze Kang, Zhenheng Tang, Xin He, Rui Guo, Xin Wang, Qiang Wang, Amelie Chi Zhou, et al. 2024. BurstGPT: A Real-world Workload Dataset to Optimize LLM Serving Systems. arXiv preprint arXiv:2401.17644 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38\u201345","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38\u201345. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.456"},{"key":"e_1_3_2_1_42_1","volume-title":"Multi-candidate speculative decoding. arXiv preprint arXiv:2401.06706","author":"Yang Sen","year":"2024","unstructured":"Sen Yang, Shujian Huang, Xinyu Dai, and Jiajun Chen. 2024. Multi-candidate speculative decoding. arXiv preprint arXiv:2401.06706 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_2_1_44_1","volume-title":"GPU-Efficient Serverless Inference via Model Swapping. arXiv preprint arXiv:2306.03622","author":"Yu Minchen","year":"2024","unstructured":"Minchen Yu, Ao Wang, Dong Chen, Haoxuan Yu, Xiaonan Luo, Zhuohao Li, Wei Wang, Ruichuan Chen, Dapeng Nie, and Haoran Yang. 2024. FaaSwap: SLO-Aware, GPU-Efficient Serverless Inference via Model Swapping. arXiv preprint arXiv:2306.03622 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"SLO-Aware Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. 2019. MArk: Exploiting Cloud Services for Cost-Effective, SLO-Aware Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 1049\u20131062. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/zhang-chengliang"},{"key":"e_1_3_2_1_46_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. {SHEPHERD}: Serving {DNNs} in the wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 787\u2013808."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.607"},{"key":"e_1_3_2_1_48_1","volume-title":"Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=uccHPGDlao","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=uccHPGDlao"},{"key":"e_1_3_2_1_49_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. { DistServe } : Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI24). 193\u2013210."},{"key":"e_1_3_2_1_50_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rsY6J3ZaTF","author":"Zhou Yongchao","year":"2024","unstructured":"Yongchao Zhou, Kaifeng Lyu, Ankit Singh Rawat, Aditya Krishna Menon, Afshin Rostamizadeh, Sanjiv Kumar, Jean-Fran\u00e7ois Kagy, and Rishabh Agarwal. 2024. DistillSpec: Improving Speculative Decoding via Knowledge Distillation. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rsY6J3ZaTF"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772239","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:27Z","timestamp":1768321167000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772239"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":50,"alternative-id":["10.1145\/3772052.3772239","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772239","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}