{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:26:42Z","timestamp":1774600002808,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U22B2023,62202255"],"award-info":[{"award-number":["U22B2023,62202255"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFB4500302"],"award-info":[{"award-number":["2022YFB4500302"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3689031.3696072","type":"proceedings-article","created":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T06:25:20Z","timestamp":1742970320000},"page":"128-143","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Fast State Restoration in LLM Serving with HCache"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5259-115X","authenticated-orcid":false,"given":"Shiwei","family":"Gao","sequence":"first","affiliation":[{"name":"Tsinghua University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4171-4299","authenticated-orcid":false,"given":"Youmin","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua 
University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7362-2789","authenticated-orcid":false,"given":"Jiwu","family":"Shu","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. Sharegpt4 Dataset. https:\/\/huggingface.co\/datasets\/openchat\/openchat_sharegpt4_dataset. [Computer software]."},{"key":"e_1_3_2_1_2_1","unstructured":"Reyna Abhyankar Zijian He Vikranth Srivatsa Hao Zhang and Yiying Zhang. 2024. APIServe: Efficient API Support for Large-Language Model Inferencing. arXiv:2402.01869 [cs.LG]"},{"key":"e_1_3_2_1_3_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 117--134. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/agrawal"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"e_1_3_2_1_5_1","unstructured":"Chenxin An Shansan Gong Ming Zhong Mukai Li Jun Zhang Lingpeng Kong and Xipeng Qiu. 2023. L-Eval: Instituting Standardized Evaluation for Long Context Language Models. arXiv:2307.11088 [cs.CL]"},{"key":"e_1_3_2_1_6_1","unstructured":"anthropic. 2024. Claude. https:\/\/claude.ai\/. [Computer software]."},{"key":"e_1_3_2_1_7_1","unstructured":"LightLLM authors. 2024. LightLLM. https:\/\/github.com\/ModelTC\/lightllm. [Computer software]."},{"key":"e_1_3_2_1_8_1","unstructured":"RTP-LLM authors. 2024. RTP-LLM. https:\/\/github.com\/alibaba\/rtp-llm. 
[Computer software]."},{"key":"e_1_3_2_1_9_1","volume-title":"FlashNeuron: SSD-Enabled Large-Batch Training of Very Deep Neural Networks. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Bae Jonghyun","unstructured":"Jonghyun Bae, Jongsung Lee, Yunho Jin, Sam Son, Shine Kim, Hakbeom Jang, Tae Jun Ham, and Jae W. Lee. 2021. FlashNeuron: SSD-Enabled Large-Batch Training of Very Deep Neural Networks. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 387--401. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/bae"},{"key":"e_1_3_2_1_10_1","unstructured":"Sebastian Borgeaud Arthur Mensch Jordan Hoffmann Trevor Cai Eliza Rutherford Katie Millican George van den Driessche Jean-Baptiste Lespiau Bogdan Damoc Aidan Clark Diego de Las Casas Aurelia Guy Jacob Menick Roman Ring T. W. Hennigan Saffron Huang Lorenzo Maggiore Chris Jones Albin Cassirer Andy Brock Michela Paganini Geoffrey Irving Oriol Vinyals Simon Osindero Karen Simonyan Jack W. Rae Erich Elsen and L. Sifre. 2021. Improving language models by retrieving from trillions of tokens. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:244954723"},{"key":"e_1_3_2_1_11_1","volume-title":"A comprehensive survey of ai-generated content (aigc): A history of generative ai from gan to chatgpt. arXiv preprint arXiv:2303.04226","author":"Cao Yihan","year":"2023","unstructured":"Yihan Cao, Siyu Li, Yixin Liu, Zhiling Yan, Yutong Dai, Philip S Yu, and Lichao Sun. 2023. A comprehensive survey of ai-generated content (aigc): A history of generative ai from gan to chatgpt. arXiv preprint arXiv:2303.04226 (2023)."},{"key":"e_1_3_2_1_12_1","unstructured":"Harrison Chase. 2022. LangChain. 
https:\/\/github.com\/langchain-ai\/langchain"},{"key":"e_1_3_2_1_13_1","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique Ponde de Oliveira Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman Alex Ray Raul Puri Gretchen Krueger Michael Petrov Heidy Khlaaf Girish Sastry Pamela Mishkin Brooke Chan Scott Gray Nick Ryder Mikhail Pavlov Alethea Power Lukasz Kaiser Mohammad Bavarian Clemens Winter Philippe Tillet Felipe Petroski Such Dave Cummings Matthias Plappert Fotios Chantzis Elizabeth Barnes Ariel Herbert-Voss William Hebgen Guss Alex Nichol Alex Paino Nikolas Tezak Jie Tang Igor Babuschkin Suchir Balaji Shantanu Jain William Saunders Christopher Hesse Andrew N. Carr Jan Leike Josh Achiam Vedant Misra Evan Morikawa Alec Radford Matthew Knight Miles Brundage Mira Murati Katie Mayer Peter Welinder Bob McGrew Dario Amodei Sam McCandlish Ilya Sutskever and Wojciech Zaremba. 2021. Evaluating Large Language Models Trained on Code. arXiv:2107.03374 [cs.LG]"},{"key":"e_1_3_2_1_14_1","unstructured":"Karl Cobbe Vineet Kosaraju Mohammad Bavarian Mark Chen Heewoo Jun Lukasz Kaiser Matthias Plappert Jerry Tworek Jacob Hilton Reiichiro Nakano et al. 2021. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021)."},{"key":"e_1_3_2_1_15_1","unstructured":"DeepSpeed. 2024. DeepSpeed-MII. https:\/\/github.com\/microsoft\/DeepSpeed-MII. [Computer software]."},{"key":"e_1_3_2_1_16_1","volume-title":"QAQ: Quality Adaptive Quantization for LLM KV Cache. arXiv preprint arXiv:2403.04643","author":"Dong Shichen","year":"2024","unstructured":"Shichen Dong, Wen Cheng, Jiayu Qin, and Wei Wang. 2024. QAQ: Quality Adaptive Quantization for LLM KV Cache. 
arXiv preprint arXiv:2403.04643 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575703"},{"key":"e_1_3_2_1_18_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. OPTQ: Accurate quantization for generative pre-trained transformers. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","volume-title":"Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention. In 2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 111--126. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/gao-bin-cost"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.)","volume":"6","author":"Gim In","year":"2024","unstructured":"In Gim, Guojun Chen, Seung-seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2024. Prompt Cache: Modular Attention Reuse for Low-Latency Inference. In Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.), Vol. 6. 325--338. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/a66caa1703fe34705a4368c3014c1966-Paper-Conference.pdf"},{"key":"e_1_3_2_1_21_1","volume-title":"Neuhoff","author":"Gray Robert M.","year":"1998","unstructured":"Robert M. Gray and David L. Neuhoff. 1998. Quantization. 
IEEE transactions on information theory 44, 6 (1998), 2325--2383."},{"key":"e_1_3_2_1_22_1","volume-title":"Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, and Yuxiong He.","author":"Holmes Connor","year":"2024","unstructured":"Connor Holmes, Masahiro Tanaka, Michael Wyatt, Ammar Ahmad Awan, Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, and Yuxiong He. 2024. DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference. arXiv:2401.08671 [cs.PF]"},{"key":"e_1_3_2_1_23_1","volume-title":"Kurt Keutzer, and Amir Gholami.","author":"Hooper Coleman","year":"2024","unstructured":"Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael W Mahoney, Yakun Sophia Shao, Kurt Keutzer, and Amir Gholami. 2024. KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache Quantization. arXiv preprint arXiv:2401.18079 (2024)."},{"key":"e_1_3_2_1_24_1","unstructured":"Lei Huang Weijiang Yu Weitao Ma Weihong Zhong Zhangyin Feng Haotian Wang Qianglong Chen Weihua Peng Xiaocheng Feng Bing Qin and Ting Liu. 2023. A Survey on Hallucination in Large Language Models: Principles Taxonomy Challenges and Open Questions. arXiv:2311.05232 [cs.CL]"},{"key":"e_1_3_2_1_25_1","volume-title":"Weizhu Chen, Allie Del Giorno, Ronen Eldan, Sivakanth Gopi, et al.","author":"Javaheripi Mojan","year":"2023","unstructured":"Mojan Javaheripi, S\u00e9bastien Bubeck, Marah Abdin, Jyoti Aneja, Sebastien Bubeck, Caio C\u00e9sar Teodoro Mendes, Weizhu Chen, Allie Del Giorno, Ronen Eldan, Sivakanth Gopi, et al. 2023. Phi-2: The surprising power of small language models. Microsoft Research Blog (2023)."},{"key":"e_1_3_2_1_26_1","unstructured":"Chao Jin Zili Zhang Xuanlin Jiang Fangyue Liu Xin Liu Xuanzhe Liu and Xin Jin. 2024. RAGCache: Efficient Knowledge Caching for Retrieval-Augmented Generation. 
arXiv:2404.12457 [cs.DC]"},{"key":"e_1_3_2_1_27_1","volume-title":"GEAR: An Efficient KV Cache Compression Recipe for Near-Lossless Generative Inference of LLM. arXiv:2403.05527 [cs.LG]","author":"Kang Hao","year":"2024","unstructured":"Hao Kang, Qingru Zhang, Souvik Kundu, Geonhwa Jeong, Zaoxing Liu, Tushar Krishna, and Tuo Zhao. 2024. GEAR: An Efficient KV Cache Compression Recipe for Near-Lossless Generative Inference of LLM. arXiv:2403.05527 [cs.LG]"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_29_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","volume-title":"Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. 2023. Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","unstructured":"Jerry Liu. 2022. LlamaIndex. https:\/\/doi.org\/10.5281\/zenodo.1234","DOI":"10.5281\/zenodo.1234"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672274"},{"key":"e_1_3_2_1_33_1","volume-title":"KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache. 
arXiv preprint arXiv:2402.02750","author":"Liu Zirui","year":"2024","unstructured":"Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen Zhong, Zhaozhuo Xu, Vladimir Braverman, Beidi Chen, and Xia Hu. 2024. KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache. arXiv preprint arXiv:2402.02750 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=2DtxPCL3T5","author":"Mu Jesse","year":"2023","unstructured":"Jesse Mu, Xiang Lisa Li, and Noah Goodman. 2023. Learning to Compress Prompts with Gist Tokens. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=2DtxPCL3T5"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3639187"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_37_1","unstructured":"NVIDIA. 2024. gdrcopy. https:\/\/github.com\/NVIDIA\/gdrcopy. [Computer software]."},{"key":"e_1_3_2_1_38_1","unstructured":"NVIDIA. 2024. Long context prompting for Claude 2.1. https:\/\/www.anthropic.com\/news\/claude-2-1-prompting. [Website]."},{"key":"e_1_3_2_1_39_1","unstructured":"NVIDIA. 2024. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM. [Computer software]."},{"key":"e_1_3_2_1_40_1","unstructured":"OpenAI. 2024. ChatGPT. https:\/\/chat.openai.com\/. [Computer software]."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.391"},{"key":"e_1_3_2_1_42_1","volume-title":"Kranthi Kiran GV, et al","author":"Peng Bo","year":"2023","unstructured":"Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Huanqi Cao, Xin Cheng, Michael Chung, Matteo Grella, Kranthi Kiran GV, et al. 2023. Rwkv: Reinventing rnns for the transformer era. arXiv preprint arXiv:2305.13048 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Fast transformer decoding: One write-head is all you need. 
arXiv preprint arXiv:1911.02150","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. 2019. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150 (2019)."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. FlexGen: high-throughput generative inference of large language models with a single GPU. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML'23). JMLR.org, Article 1288, 23 pages."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613169"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_2_1_47_1","unstructured":"Gemma Team Thomas Mesnard Cassidy Hardin Robert Dadashi Surya Bhupatiraju Shreya Pathak Laurent Sifre Morgane Rivi\u00e8re Mihir Sanjay Kale Juliette Love Pouya Tafti L\u00e9onard Hussenot Pier Giuseppe Sessa Aakanksha Chowdhery Adam Roberts Aditya Barua Alex Botev Alex Castro-Ros Ambrose Slone Am\u00e9lie H\u00e9liou Andrea Tacchetti Anna Bulanova Antonia Paterson Beth Tsai Bobak Shahriari Charline Le Lan Christopher A. 
Choquette-Choo Cl\u00e9ment Crepy Daniel Cer Daphne Ippolito David Reid Elena Buchatskaya Eric Ni Eric Noland Geng Yan George Tucker George-Christian Muraru Grigory Rozhdestvenskiy Henryk Michalewski Ian Tenney Ivan Grishchenko Jacob Austin James Keeling Jane Labanowski Jean-Baptiste Lespiau Jeff Stanway Jenny Brennan Jeremy Chen Johan Ferret Justin Chiu Justin Mao-Jones Katherine Lee Kathy Yu Katie Millican Lars Lowe Sjoesund Lisa Lee Lucas Dixon Machel Reid Maciej Mikuia Mateo Wirth Michael Sharman Nikolai Chinaev Nithum Thain Olivier Bachem Oscar Chang Oscar Wahltinez Paige Bailey Paul Michel Petko Yotov Rahma Chaabouni Ramona Comanescu Reena Jana Rohan Anil Ross McIlroy Ruibo Liu Ryan Mullins Samuel L Smith Sebastian Borgeaud Sertan Girgin Sholto Douglas Shree Pandya Siamak Shakeri Soham De Ted Klimenko Tom Hennigan Vlad Feinberg Wojciech Stokowiec Yu hui Chen Zafarali Ahmed Zhitao Gong Tris Warkentin Ludovic Peran Minh Giang Cl\u00e9ment Farabet Oriol Vinyals Jeff Dean Koray Kavukcuoglu Demis Hassabis Zoubin Ghahramani Douglas Eck Joelle Barral Fernando Pereira Eli Collins Armand Joulin Noah Fiedel Evan Senter Alek Andreev and Kathleen Kenealy. 2024. Gemma: Open Models Based on Gemini Research and Technology. arXiv:2403.08295 [cs.CL]"},{"key":"e_1_3_2_1_48_1","volume-title":"A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313","author":"Tonmoy SM","year":"2024","unstructured":"SM Tonmoy, SM Zaman, Vinija Jain, Anku Rani, Vipula Rawte, Aman Chadha, and Amitava Das. 2024. A comprehensive survey of hallucination mitigation techniques in large language models. 
arXiv preprint arXiv:2401.01313 (2024)."},{"key":"e_1_3_2_1_49_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 [cs.CL]"},{"key":"e_1_3_2_1_50_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-024-40231-1"},{"key":"e_1_3_2_1_52_1","volume-title":"Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Gang Huang, Xuanzhe Liu, and Xin Jin. 2023. Fast distributed inference serving for large language models. 
arXiv preprint arXiv:2305.05920 (2023)."},{"key":"e_1_3_2_1_53_1","unstructured":"Zhiheng Xi Wenxiang Chen Xin Guo Wei He Yiwen Ding Boyang Hong Ming Zhang Junzhe Wang Senjie Jin Enyu Zhou Rui Zheng Xiaoran Fan Xiao Wang Limao Xiong Yuhao Zhou Weiran Wang Changhao Jiang Yicheng Zou Xiangyang Liu Zhangyue Yin Shihan Dou Rongxiang Weng Wensen Cheng Qi Zhang Wenjuan Qin Yongyan Zheng Xipeng Qiu Xuanjing Huang and Tao Gui. 2023. The Rise and Potential of Large Language Model Based Agents: A Survey. arXiv:2309.07864 [cs.AI]"},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"38099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 38087--38099. https:\/\/proceedings.mlr.press\/v202\/xiao23c.html"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519554"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.14778\/3579075.3579077"},{"key":"e_1_3_2_1_57_1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1--17","author":"Xie Minhui","year":"2020","unstructured":"Minhui Xie, Kai Ren, Youyou Lu, Guangxu Yang, Qingxing Xu, Bihai Wu, Jiazhen Lin, Hongbo Ao, Wanhong Xu, and Jiwu Shu. 2020. Kraken: memory-efficient continual learning for large-scale real-time recommendations. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. 
IEEE, 1--17."},{"key":"e_1_3_2_1_58_1","volume-title":"Se Jung Kwon, and Dongsoo Lee","author":"Yang June Yong","year":"2024","unstructured":"June Yong Yang, Byeongwook Kim, Jeongin Bae, Beomseok Kwon, Gunho Park, Eunho Yang, Se Jung Kwon, and Dongsoo Lee. 2024. No Token Left Behind: Reliable KV Cache Compression via Importance-Aware Mixed Precision Quantization. arXiv preprint arXiv:2402.18096 (2024)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CloudCom.2017.14"},{"key":"e_1_3_2_1_60_1","unstructured":"Lu Ye Ze Tao Yong Huang and Yang Li. 2024. ChunkAttention: Efficient Attention on KV Cache with Chunking Sharing and Batching. https:\/\/openreview.net\/forum?id=9k27IITeAZ"},{"key":"e_1_3_2_1_61_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521--538."},{"key":"e_1_3_2_1_62_1","unstructured":"Lingfan Yu and Jinyang Li. 2023. Stateful Large Language Model Serving with Pensieve. arXiv:2312.05516 [cs.LG]"},{"key":"e_1_3_2_1_63_1","volume-title":"KV Cache is 1 Bit Per Channel: Efficient Large Language Model Inference with Coupled Quantization. arXiv preprint arXiv:2405.03917","author":"Zhang Tianyi","year":"2024","unstructured":"Tianyi Zhang, Jonah Yi, Zhaozhuo Xu, and Anshumali Shrivastava. 2024. KV Cache is 1 Bit Per Channel: Efficient Large Language Model Inference with Coupled Quantization. arXiv preprint arXiv:2405.03917 (2024)."},{"key":"e_1_3_2_1_64_1","unstructured":"Zhenyu Zhang Ying Sheng Tianyi Zhou Tianlong Chen Lianmin Zheng Ruisi Cai Zhao Song Yuandong Tian Christopher R\u00e9 Clark Barrett et al. 2024. 
H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_65_1","volume-title":"Atom: Low-bit quantization for efficient and accurate llm serving. arXiv preprint arXiv:2310.19102","author":"Zhao Yilong","year":"2023","unstructured":"Yilong Zhao, Chien-Yu Lin, Kan Zhu, Zihao Ye, Lequn Chen, Size Zheng, Luis Ceze, Arvind Krishnamurthy, Tianqi Chen, and Baris Kasikci. 2023. Atom: Low-bit quantization for efficient and accurate llm serving. arXiv preprint arXiv:2310.19102 (2023)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3591196.3596612"},{"key":"e_1_3_2_1_67_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng.","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. 2023. Efficiently Programming Large Language Models using SGLang. 
arXiv:2312.07104 [cs.AI]"}],"event":{"name":"EuroSys '25: Twentieth European Conference on Computer Systems","location":"Rotterdam Netherlands","acronym":"EuroSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Twentieth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696072","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689031.3696072","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:18:29Z","timestamp":1755775109000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696072"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":67,"alternative-id":["10.1145\/3689031.3696072","10.1145\/3689031"],"URL":"https:\/\/doi.org\/10.1145\/3689031.3696072","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}