{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T14:24:52Z","timestamp":1780496692661,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"University of Chicago","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3689031.3696098","type":"proceedings-article","created":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T06:25:20Z","timestamp":1742970320000},"page":"94-109","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":44,"title":["CacheBlend: Fast Large Language Model Serving for RAG with Cached Knowledge Fusion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8588-4356","authenticated-orcid":false,"given":"Jiayi","family":"Yao","sequence":"first","affiliation":[{"name":"University of Chicago\/CUHK Shenzhen"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9980-028X","authenticated-orcid":false,"given":"Hanchen","family":"Li","sequence":"additional","affiliation":[{"name":"University of Chicago"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5957-5071","authenticated-orcid":false,"given":"Yuhan","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Chicago"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0265-2144","authenticated-orcid":false,"given":"Siddhant","family":"Ray","sequence":"additional","affiliation":[{"name":"University of Chicago"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3924-6886","authenticated-orcid":false,"given":"Yihua","family":"Cheng","sequence":"additional","affiliation":[{"name":"University of Chicago"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3208-4601","authenticated-orcid":false,"given":"Qizheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Stanford University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3964-4079","authenticated-orcid":false,"given":"Kuntai","family":"Du","sequence":"additional","affiliation":[{"name":"University of Chicago"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0757-4600","authenticated-orcid":false,"given":"Shan","family":"Lu","sequence":"additional","affiliation":[{"name":"Microsoft Research \/ University of Chicago"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6877-1683","authenticated-orcid":false,"given":"Junchen","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Chicago"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"12 Practical Large Language Model (LLM) Applications - Techopedia. https:\/\/www.techopedia.com\/12-practical-large-language-model-llm-applications. (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"[2302.13971] llama: Open and efficient foundation language models. https:\/\/arxiv.org\/abs\/2302.13971. (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"7 top large language model use cases and applications. https:\/\/www.projectpro.io\/article\/large- language- model- use-cases- and-applications\/887. (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_4_1","unstructured":"Applications of large language models - indata labs. https:\/\/indatalabs.com\/blog\/large-language-model-apps. (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_5_1","unstructured":"Chains. https:\/\/python.langchain.com\/docs\/modules\/chains\/."},{"key":"e_1_3_2_1_6_1","unstructured":"Evaluating qa: Metrics predictions and the null response. https:\/\/github.com\/fastforwardlabs\/ff14_blog\/blob\/master\/_notebooks\/2020-06-09-Evaluating_BERT_on_SQuAD.ipynb."},{"key":"e_1_3_2_1_7_1","unstructured":"Langchain: Map reduce. https:\/\/api.python.langchain.com\/en\/latest\/chains\/langchain.chains.combine_documents.map_reduce.MapReduceDocumentsChain.html#langchain.chains.combine_documents.map_reduce.MapReduceDocumentsChain."},{"key":"e_1_3_2_1_8_1","unstructured":"Langchain: Map rerank. https:\/\/api.python.langchain.com\/en\/latest\/chains\/langchain.chains.combine_documents.map_rerank.MapRerankDocumentsChain.html."},{"key":"e_1_3_2_1_9_1","unstructured":"Real-world use cases for large language models (llms) | by cellstrat | medium. https:\/\/cellstrat.medium.com\/real-world-use-cases-for-large-language-models-llms-d71c3a577bf2. (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Cloud compute made easy. https:\/\/www.runpod.io\/","author":"Runpod","year":"2024","unstructured":"Runpod: Cloud compute made easy. https:\/\/www.runpod.io\/, 2024. Accessed: 2024-05-21."},{"key":"e_1_3_2_1_11_1","volume-title":"Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S. Gulavani, and Ramachandran Ramjee. Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills, 2023."},{"key":"e_1_3_2_1_12_1","volume-title":"Longbench: A bilingual, multitask benchmark for long context understanding. arXiv preprint arXiv:2308.14508","author":"Bai Yushi","year":"2023","unstructured":"Yushi Bai, Xin Lv, Jiajie Zhang, Hongchang Lyu, Jiankai Tang, Zhidian Huang, Zhengxiao Du, Xiao Liu, Aohan Zeng, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li. Longbench: A bilingual, multitask benchmark for long context understanding. arXiv preprint arXiv:2308.14508, 2023."},{"key":"e_1_3_2_1_13_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901, 2020."},{"key":"e_1_3_2_1_14_1","first-page":"17413","article-title":"Scatterbrain: Unifying sparse and low-rank attention","volume":"34","author":"Chen Beidi","year":"2021","unstructured":"Beidi Chen, Tri Dao, Eric Winsor, Zhao Song, Atri Rudra, and Christopher R\u00e9. Scatterbrain: Unifying sparse and low-rank attention. Advances in Neural Information Processing Systems, 34:17413--17426, 2021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_15_1","volume-title":"Rethinking attention with performers. arXiv preprint arXiv:2009.14794","author":"Choromanski Krzysztof","year":"2020","unstructured":"Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, et al. Rethinking attention with performers. arXiv preprint arXiv:2009.14794, 2020."},{"key":"e_1_3_2_1_16_1","volume-title":"Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311","author":"Chowdhery Aakanksha","year":"2022","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311, 2022."},{"key":"e_1_3_2_1_17_1","volume-title":"Razvan Pascanu, Nando De Freitas, and Caglar Gulcehre. Griffin: Mixing gated linear recurrences with local attention for efficient language models","author":"De Soham","year":"2024","unstructured":"Soham De, Samuel L. Smith, Anushan Fernando, Aleksandar Botev, George Cristian-Muraru, Albert Gu, Ruba Haroun, Leonard Berrada, Yutian Chen, Srivatsan Srinivasan, Guillaume Desjardins, Arnaud Doucet, David Budden, Yee Whye Teh, Razvan Pascanu, Nando De Freitas, and Caglar Gulcehre. Griffin: Mixing gated linear recurrences with local attention for efficient language models, 2024."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/1327452.1327492"},{"key":"e_1_3_2_1_19_1","volume-title":"Get more with less: Synthesizing recurrence with kv cache compression for efficient llm inference. arXiv preprint arXiv:2402.09398","author":"Dong Harry","year":"2024","unstructured":"Harry Dong, Xinyu Yang, Zhenyu Zhang, Zhangyang Wang, Yuejie Chi, and Beidi Chen. Get more with less: Synthesizing recurrence with kv cache compression for efficient llm inference. arXiv preprint arXiv:2402.09398, 2024."},{"key":"e_1_3_2_1_20_1","volume-title":"Multi-news: A large-scale multi-document summarization dataset and abstractive hierarchical model. arXiv preprint arXiv:1906.01749","author":"Fabbri Alexander R","year":"2019","unstructured":"Alexander R Fabbri, Irene Li, Tianwei She, Suyi Li, and Dragomir R Radev. Multi-news: A large-scale multi-document summarization dataset and abstractive hierarchical model. arXiv preprint arXiv:1906.01749, 2019."},{"key":"e_1_3_2_1_21_1","volume-title":"Attentionstore: Cost-effective attention reuse across multi-turn conversations in large language model serving","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. Attentionstore: Cost-effective attention reuse across multi-turn conversations in large language model serving, 2024."},{"key":"e_1_3_2_1_22_1","volume-title":"Retrieval-augmented generation for large language models: A survey","author":"Gao Yunfan","year":"2023","unstructured":"Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, and Haofen Wang. Retrieval-augmented generation for large language models: A survey, 2023."},{"key":"e_1_3_2_1_23_1","volume-title":"Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997","author":"Gao Yunfan","year":"2023","unstructured":"Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, and Haofen Wang. Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997, 2023."},{"key":"e_1_3_2_1_24_1","volume-title":"Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. Prompt cache: Modular attention reuse for low-latency inference","author":"Gim In","year":"2023","unstructured":"In Gim, Guojun Chen, Seung seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. Prompt cache: Modular attention reuse for low-latency inference, 2023."},{"key":"e_1_3_2_1_25_1","volume-title":"Samsum corpus: A human-annotated dialogue dataset for abstractive summarization. arXiv preprint arXiv:1911.12237","author":"Gliwa Bogdan","year":"2019","unstructured":"Bogdan Gliwa, Iwona Mochol, Maciej Biesek, and Aleksander Wawer. Samsum corpus: A human-annotated dialogue dataset for abstractive summarization. arXiv preprint arXiv:1911.12237, 2019."},{"key":"e_1_3_2_1_26_1","volume-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Gu Albert","year":"2023","unstructured":"Albert Gu and Tri Dao. Mamba: Linear-time sequence modeling with selective state spaces, 2023."},{"key":"e_1_3_2_1_27_1","volume-title":"Saku Sugawara, and Akiko Aizawa. Constructing a multi-hop qa dataset for comprehensive evaluation of reasoning steps. arXiv preprint arXiv:2011.01060","author":"Ho Xanh","year":"2020","unstructured":"Xanh Ho, Anh-Khoa Duong Nguyen, Saku Sugawara, and Akiko Aizawa. Constructing a multi-hop qa dataset for comprehensive evaluation of reasoning steps. arXiv preprint arXiv:2011.01060, 2020."},{"key":"e_1_3_2_1_28_1","volume-title":"Kurt Keutzer, and Amir Gholami. Kvquant: Towards 10 million context length llm inference with kv cache quantization. arXiv preprint arXiv:2401.18079","author":"Hooper Coleman","year":"2024","unstructured":"Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael W Mahoney, Yakun Sophia Shao, Kurt Keutzer, and Amir Gholami. Kvquant: Towards 10 million context length llm inference with kv cache quantization. arXiv preprint arXiv:2401.18079, 2024."},{"key":"e_1_3_2_1_29_1","volume-title":"Optimize rag efficiency with llamaindex: The perfect chunk size. https:\/\/datasciencedojo.com\/blog\/rag-with-llamaindex\/, october","author":"Jan Muhammad","year":"2023","unstructured":"Muhammad Jan. Optimize rag efficiency with llamaindex: The perfect chunk size. https:\/\/datasciencedojo.com\/blog\/rag-with-llamaindex\/, october 2023."},{"key":"e_1_3_2_1_30_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral 7b. arXiv preprint arXiv:2310.06825","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral 7b. arXiv preprint arXiv:2310.06825, 2023."},{"key":"e_1_3_2_1_31_1","volume-title":"Llmlingua: Compressing prompts for accelerated inference of large language models. arXiv preprint arXiv:2310.05736","author":"Jiang Huiqiang","year":"2023","unstructured":"Huiqiang Jiang, Qianhui Wu, Chin-Yew Lin, Yuqing Yang, and Lili Qiu. Llmlingua: Compressing prompts for accelerated inference of large language models. arXiv preprint arXiv:2310.05736, 2023."},{"key":"e_1_3_2_1_32_1","volume-title":"Longllmlingua: Accelerating and enhancing llms in long context scenarios via prompt compression","author":"Jiang Huiqiang","year":"2023","unstructured":"Huiqiang Jiang, Qianhui Wu, Xufang Luo, Dongsheng Li, Chin-Yew Lin, Yuqing Yang, and Lili Qiu. Longllmlingua: Accelerating and enhancing llms in long context scenarios via prompt compression, 2023."},{"key":"e_1_3_2_1_33_1","volume-title":"Ragcache: Efficient knowledge caching for retrieval-augmented generation. arXiv preprint arXiv:2404.12457","author":"Jin Chao","year":"2024","unstructured":"Chao Jin, Zili Zhang, Xuanlin Jiang, Fangyue Liu, Xin Liu, Xuanzhe Liu, and Xin Jin. Ragcache: Efficient knowledge caching for retrieval-augmented generation. arXiv preprint arXiv:2404.12457, 2024."},{"key":"e_1_3_2_1_34_1","volume-title":"Billion-scale similarity search with GPUs","author":"Johnson Jeff","year":"2017","unstructured":"Jeff Johnson, Matthijs Douze, and Herv\u00e9 J\u00e9gou. Billion-scale similarity search with GPUs, 2017."},{"key":"e_1_3_2_1_35_1","volume-title":"Gear: An efficient kv cache compression recipe for near-lossless generative inference of llm","author":"Kang Hao","year":"2024","unstructured":"Hao Kang, Qingru Zhang, Souvik Kundu, Geonhwa Jeong, Zaoxing Liu, Tushar Krishna, and Tuo Zhao. Gear: An efficient kv cache compression recipe for near-lossless generative inference of llm, 2024."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_37_1","volume-title":"A survey on retrieval-augmented text generation. arXiv preprint arXiv:2202.01110","author":"Li Huayang","year":"2022","unstructured":"Huayang Li, Yixuan Su, Deng Cai, Yan Wang, and Lemao Liu. A survey on retrieval-augmented text generation. arXiv preprint arXiv:2202.01110, 2022."},{"key":"e_1_3_2_1_38_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lin Chaofan","year":"2024","unstructured":"Chaofan Lin, Chengruidong Zhang Zhenhua Han, Yuqing Yang, Fan Yang, Chen Chen, and Lili Qiu. Parrot: Efficient serving of llm-based applications with semantic variable. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), Santa Clara, CA, July 2024. USENIX Association."},{"key":"e_1_3_2_1_39_1","first-page":"74","volume-title":"Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. ROUGE: A package for automatic evaluation of summaries. In Text Summarization Branches Out, pages 74--81, Barcelona, Spain, July 2004. Association for Computational Linguistics."},{"key":"e_1_3_2_1_40_1","volume-title":"Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172","author":"Liu Nelson F","year":"2023","unstructured":"Nelson F Liu, Kevin Lin, John Hewitt, Ashwin Paranjape, Michele Bevilacqua, Fabio Petroni, and Percy Liang. Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172, 2023."},{"key":"e_1_3_2_1_41_1","volume-title":"Optimizing llm queries in relational workloads","author":"Liu Shu","year":"2024","unstructured":"Shu Liu, Asim Biswal, Audrey Cheng, Xiangxi Mo, Shiyi Cao, Joseph E. Gonzalez, Ion Stoica, and Matei Zaharia. Optimizing llm queries in relational workloads, 2024."},{"key":"e_1_3_2_1_42_1","volume-title":"et al. Cachegen: Fast context loading for language model applications. arXiv preprint arXiv:2310.07240","author":"Liu Yuhan","year":"2023","unstructured":"Yuhan Liu, Hanchen Li, Kuntai Du, Jiayi Yao, Yihua Cheng, Yuyang Huang, Shan Lu, Michael Maire, Henry Hoffmann, Ari Holtzman, et al. Cachegen: Fast context loading for language model applications. arXiv preprint arXiv:2310.07240, 2023."},{"key":"e_1_3_2_1_43_1","first-page":"36","article-title":"Exploiting the persistence of importance hypothesis for llm kv cache compression at test time","author":"Liu Zichang","year":"2024","unstructured":"Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu, Anastasios Kyrillidis, and Anshumali Shrivastava. Scissorhands: Exploiting the persistence of importance hypothesis for llm kv cache compression at test time. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","first-page":"22137","volume-title":"International Conference on Machine Learning","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, et al. Deja vu: Contextual sparsity for efficient llms at inference time. In International Conference on Machine Learning, pages 22137--22176. PMLR, 2023."},{"key":"e_1_3_2_1_45_1","volume-title":"Kivi: A tuning-free asymmetric 2bit quantization for kv cache. arXiv preprint arXiv:2402.02750","author":"Liu Zirui","year":"2024","unstructured":"Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen Zhong, Zhaozhuo Xu, Vladimir Braverman, Beidi Chen, and Xia Hu. Kivi: A tuning-free asymmetric 2bit quantization for kv cache. arXiv preprint arXiv:2402.02750, 2024."},{"key":"e_1_3_2_1_46_1","volume-title":"Generation-augmented retrieval for open-domain question answering. arXiv preprint arXiv:2009.08553","author":"Mao Yuning","year":"2020","unstructured":"Yuning Mao, Pengcheng He, Xiaodong Liu, Yelong Shen, Jianfeng Gao, Jiawei Han, and Weizhu Chen. Generation-augmented retrieval for open-domain question answering. arXiv preprint arXiv:2009.08553, 2020."},{"key":"e_1_3_2_1_47_1","volume-title":"Fine-tuned transformers show clusters of similar representations across layers. arXiv preprint arXiv:2109.08406","author":"Phang Jason","year":"2021","unstructured":"Jason Phang, Haokun Liu, and Samuel R Bowman. Fine-tuned transformers show clusters of similar representations across layers. arXiv preprint arXiv:2109.08406, 2021."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00605"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_2_1_51_1","volume-title":"Musique: Multihop questions via single-hop question composition","author":"Trivedi Harsh","year":"2022","unstructured":"Harsh Trivedi, Niranjan Balasubramanian, Tushar Khot, and Ashish Sabharwal. Musique: Multihop questions via single-hop question composition, 2022."},{"key":"e_1_3_2_1_52_1","volume-title":"Attention Is All You Need","author":"Vaswani Ashish","year":"2023","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention Is All You Need, 2023."},{"key":"e_1_3_2_1_53_1","volume-title":"Loongserve: Efficiently serving long-context large language models with elastic sequence parallelism","author":"Wu Bingyang","year":"2024","unstructured":"Bingyang Wu, Shengyu Liu, Yinmin Zhong, Peng Sun, Xuanzhe Liu, and Xin Jin. Loongserve: Efficiently serving long-context large language models with elastic sequence parallelism, 2024."},{"key":"e_1_3_2_1_54_1","volume-title":"Retrieval meets long context large language models. arXiv preprint arXiv:2310.03025","author":"Xu Peng","year":"2023","unstructured":"Peng Xu, Wei Ping, Xianchao Wu, Lawrence McAfee, Chen Zhu, Zihan Liu, Sandeep Subramanian, Evelina Bakhturina, Mohammad Shoeybi, and Bryan Catanzaro. Retrieval meets long context large language models. arXiv preprint arXiv:2310.03025, 2023."},{"key":"e_1_3_2_1_55_1","volume-title":"Llm as a system service on mobile devices","author":"Yin Wangsong","year":"2024","unstructured":"Wangsong Yin, Mengwei Xu, Yuanchun Li, and Xuanzhe Liu. Llm as a system service on mobile devices, 2024."},{"key":"e_1_3_2_1_56_1","volume-title":"et al. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652","author":"Young Alex","year":"2024","unstructured":"Alex Young, Bei Chen, Chao Li, Chengen Huang, Ge Zhang, Guanwei Zhang, Heng Li, Jiangcheng Zhu, Jianqun Chen, Jing Chang, et al. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652, 2024."},{"key":"e_1_3_2_1_57_1","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521--538, 2022."},{"key":"e_1_3_2_1_58_1","first-page":"36","article-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models","author":"Zhang Zhenyu","year":"2024","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, et al. H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_59_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. Efficiently programming large language models using sglang. arXiv preprint arXiv:2312.07104","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. Efficiently programming large language models using sglang. arXiv preprint arXiv:2312.07104, 2023."},{"key":"e_1_3_2_1_60_1","volume-title":"Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving. arXiv preprint arXiv:2401.09670","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving. arXiv preprint arXiv:2401.09670, 2024."}],"event":{"name":"EuroSys '25: Twentieth European Conference on Computer Systems","location":"Rotterdam Netherlands","acronym":"EuroSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Twentieth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696098","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689031.3696098","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:23:39Z","timestamp":1755775419000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696098"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":60,"alternative-id":["10.1145\/3689031.3696098","10.1145\/3689031"],"URL":"https:\/\/doi.org\/10.1145\/3689031.3696098","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}