{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T07:56:14Z","timestamp":1780473374907,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":157,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,4]],"date-time":"2024-08-04T00:00:00Z","timestamp":1722729600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"NSF","award":["CNS-2146496"],"award-info":[{"award-number":["CNS-2146496"]}]},{"name":"NSF","award":["CNS-2131826"],"award-info":[{"award-number":["CNS-2131826"]}]},{"name":"NSF","award":["CNS-2313190"],"award-info":[{"award-number":["CNS-2313190"]}]},{"name":"NSF","award":["CNS-1901466"],"award-info":[{"award-number":["CNS-1901466"]}]},{"name":"NSF","award":["CNS-1956180"],"award-info":[{"award-number":["CNS-1956180"]}]},{"name":"NSF","award":["CCF-2119184"],"award-info":[{"award-number":["CCF-2119184"]}]},{"name":"UChicago CERES Center"},{"name":"MARIN and Stuart Rice Research Award"},{"name":"Chameleon Projects"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,4]]},"DOI":"10.1145\/3651890.3672274","type":"proceedings-article","created":{"date-parts":[[2024,7,31]],"date-time":"2024-07-31T13:11:43Z","timestamp":1722431503000},"page":"38-56","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":77,"title":["CacheGen: KV Cache Compression and Streaming for Fast Large Language Model Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-5957-5071","authenticated-orcid":false,"given":"Yuhan","family":"Liu","sequence":"first","affiliation":[{"name":"University of Chicago, Chicago, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9980-028X","authenticated-orcid":false,"given":"Hanchen","family":"Li","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3924-6886","authenticated-orcid":false,"given":"Yihua","family":"Cheng","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0265-2144","authenticated-orcid":false,"given":"Siddhant","family":"Ray","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8822-3115","authenticated-orcid":false,"given":"Yuyang","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3208-4601","authenticated-orcid":false,"given":"Qizheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Stanford University, Palo Alto, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3964-4079","authenticated-orcid":false,"given":"Kuntai","family":"Du","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8588-4356","authenticated-orcid":false,"given":"Jiayi","family":"Yao","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0757-4600","authenticated-orcid":false,"given":"Shan","family":"Lu","sequence":"additional","affiliation":[{"name":"Microsoft Research, Seattle, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7479-1664","authenticated-orcid":false,"given":"Ganesh","family":"Ananthanarayanan","sequence":"additional","affiliation":[{"name":"Microsoft, Seattle, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9778-6673","authenticated-orcid":false,"given":"Michael","family":"Maire","sequence":"additional","affiliation":[{"name":"The University of Chicago, Chicago, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0816-8150","authenticated-orcid":false,"given":"Henry","family":"Hoffmann","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2344-3800","authenticated-orcid":false,"given":"Ari","family":"Holtzman","sequence":"additional","affiliation":[{"name":"Meta, University of Chicago, Chicago, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6877-1683","authenticated-orcid":false,"given":"Junchen","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,8,4]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2021. How latency affects user engagement. https:\/\/pusher.com\/blog\/how-latency-affects-user-engagement\/. (2021). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"2023. Best Practices for Deploying Large Language Models (LLMs) in Production. https:\/\/medium.com\/@_aigeek\/best-practices-for-deploying-large-language-models-llms-in-production-fdc5bf240d6a. (2023). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"2023. Building RAG-based LLM Applications for Production. https:\/\/www.anyscale.com\/blog\/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1. (2023). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. Amazon Bedrock Pricing. https:\/\/aws.amazon.com\/bedrock\/pricing\/. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. Anyscale Pricing. https:\/\/docs.endpoints.anyscale.com\/pricing. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. AWS Pricing examples. https:\/\/aws.amazon.com\/s3\/pricing\/. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_7_1","unstructured":"2024. ChatGPT. https:\/\/chat.openai.com\/gpts. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_8_1","unstructured":"2024. pathwaycom\/llmapp. https:\/\/github.com\/pathwaycom\/llm-app. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_9_1","unstructured":"2024. Perplexity. https:\/\/www.perplexity.ai\/. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_10_1","unstructured":"2024. RAG-Transform. https:\/\/huggingface.co\/transformers\/v4.3.0\/model_doc\/rag.html. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_11_1","unstructured":"2024. Replicate Pricing. https:\/\/replicate.com\/pricing. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_12_1","unstructured":"2024. together.pricing. https:\/\/www.together.ai\/pricing. (2024). Accessed: 2024-01-25."},{"key":"e_1_3_2_1_13_1","volume-title":"Le","author":"Adiwardana Daniel","year":"2020","unstructured":"Daniel Adiwardana, Minh-Thang Luong, David R. So, Jamie Hall, Noah Fiedel, Romal Thoppilan, Zi Yang, Apoorv Kulshreshtha, Gaurav Nemade, Yifeng Lu, and Quoc V. Le. 2020. Towards a Human-like Open-Domain Chatbot. (2020). arXiv:cs.CL\/2001.09977"},{"key":"e_1_3_2_1_14_1","volume-title":"LLM Inference Performance Engineering: Best Practices. (Oct","author":"Agarwal Megha","year":"2023","unstructured":"Megha Agarwal, Asfandyar Qureshi, Nikhil Sardana, Linden Li, Julian Quevedo, and Daya Khudia. 2023. LLM Inference Performance Engineering: Best Practices. (Oct. 2023). https:\/\/www.databricks.com\/blog\/llm-inference-performance-engineering-best-practices Accessed: 2024-06-01."},{"key":"e_1_3_2_1_15_1","volume-title":"Accordion: Adaptive gradient communication via critical learning regime identification. arXiv preprint arXiv:2010.16248","author":"Agarwal Saurabh","year":"2020","unstructured":"Saurabh Agarwal, Hongyi Wang, Kangwook Lee, Shivaram Venkataraman, and Dimitris Papailiopoulos. 2020. Accordion: Adaptive gradient communication via critical learning regime identification. arXiv preprint arXiv:2010.16248 (2020)."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Agarwal Saurabh","year":"2022","unstructured":"Saurabh Agarwal, Hongyi Wang, Shivaram Venkataraman, and Dimitris Papailiopoulos. 2022. On the Utility of Gradient Compression in Distributed Training Systems. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 652--672. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2022\/file\/773862fcc2e29f650d68960ba5bd1101-Paper.pdf"},{"key":"e_1_3_2_1_17_1","volume-title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills.","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S. Gulavani, and Ramachandran Ramjee. 2023. SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. (2023). arXiv:cs.LG\/2308.16369"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3559555"},{"key":"e_1_3_2_1_19_1","volume-title":"ETC: Encoding Long and Structured Inputs in Transformers.","author":"Ainslie Joshua","year":"2020","unstructured":"Joshua Ainslie, Santiago Ontanon, Chris Alberti, Vaclav Cvicek, Zachary Fisher, Philip Pham, Anirudh Ravula, Sumit Sanghai, Qifan Wang, and Li Yang. 2020. ETC: Encoding Long and Structured Inputs in Transformers. (2020). arXiv:cs.LG\/2004.08483"},{"key":"e_1_3_2_1_20_1","unstructured":"Amazon.com Inc. 2023. 2023 Annual Report. Annual Report. Amazon.com Inc. https:\/\/s2.q4cdn.com\/299287126\/files\/doc_financials\/2024\/ar\/Amazon-com-Inc-2023-Annual-Report.pdf"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_22_1","volume-title":"Applications of Large Language Models - InData Labs. https:\/\/indatalabs.com\/blog\/large-language-model-apps. (June","author":"Anastasiya Zharovskikh","year":"2023","unstructured":"Zharovskikh Anastasiya. 2023. Applications of Large Language Models - InData Labs. https:\/\/indatalabs.com\/blog\/large-language-model-apps. (June 2023). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_23_1","unstructured":"Anonymous. 2024. ChunkAttention: Efficient Attention on KV Cache with Chunking Sharing and Batching. (2024). https:\/\/openreview.net\/forum?id=9k27IITeAZ"},{"key":"e_1_3_2_1_24_1","volume-title":"https:\/\/www.anthropic.com\/index\/100k-context-windows. (May","author":"Context Windows Anthropic","year":"2023","unstructured":"Anthropic. 2023. Anthropic \\ Introducing 100K Context Windows. https:\/\/www.anthropic.com\/index\/100k-context-windows. (May 2023). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Comparing LLM Performance: Introducing the Open Source Leaderboard for LLM APIs. (Dec","author":"Team Anyscale","year":"2023","unstructured":"Anyscale Team. 2023. Comparing LLM Performance: Introducing the Open Source Leaderboard for LLM APIs. (Dec. 2023). https:\/\/www.anyscale.com\/blog\/comparing-llm-performance-introducing-the-open-source-leaderboard-for-llm Accessed: 2024-06-01."},{"key":"e_1_3_2_1_26_1","unstructured":"AuthorName. Year. Can ChatGPT understand context and keep track of conversation history. https:\/\/www.quora.com\/Can-ChatGPT-understand-context-and-keep-track-of-conversation-history. (Year). Quora question."},{"key":"e_1_3_2_1_27_1","volume-title":"An experimental open-source attempt to make GPT-4 fully autonomous. https:\/\/github.com\/Significant-Gravitas\/Auto-GPT. (September","author":"GPT.","year":"2023","unstructured":"AutoGPT. 2023. Significant-Gravitas\/Auto-GPT: An experimental open-source attempt to make GPT-4 fully autonomous. https:\/\/github.com\/Significant-Gravitas\/Auto-GPT. (September 2023). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/860435.860505"},{"key":"e_1_3_2_1_29_1","volume-title":"Multitask Benchmark for Long Context Understanding. arXiv preprint arXiv:2308.14508","author":"Bai Yushi","year":"2023","unstructured":"Yushi Bai, Xin Lv, Jiajie Zhang, Hongchang Lyu, Jiankai Tang, Zhidian Huang, Zhengxiao Du, Xiao Liu, Aohan Zeng, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li. 2023. LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding. arXiv preprint arXiv:2308.14508 (2023)."},{"key":"e_1_3_2_1_30_1","unstructured":"Ramakrishna Bairi Atharv Sonwane Aditya Kanade Vageesh D C Arun Iyer Suresh Parthasarathy Sriram Rajamani B. Ashok and Shashank Shet. 2023. CodePlan: Repository-level Coding using LLMs and Planning. (2023). arXiv:cs.SE\/2309.12499"},{"key":"e_1_3_2_1_31_1","volume-title":"Longformer: The Long-Document Transformer.","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew E. Peters, and Arman Cohan. 2020. Longformer: The Long-Document Transformer. (2020). arXiv:cs.CL\/2004.05150"},{"key":"e_1_3_2_1_32_1","volume-title":"Unlimiformer: Long-range transformers with unlimited length input. arXiv preprint arXiv:2305.01625","author":"Bertsch Amanda","year":"2023","unstructured":"Amanda Bertsch, Uri Alon, Graham Neubig, and Matthew R Gormley. 2023. Unlimiformer: Long-range transformers with unlimited length input. arXiv preprint arXiv:2305.01625 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Site Reliability Engineering: How Google Runs Production Systems","author":"Beyer Betsy","unstructured":"Betsy Beyer, Chris Jones, Jennifer Petoff, and Niall Richard Murphy. 2016. Site Reliability Engineering: How Google Runs Production Systems (1st ed.). O'Reilly Media, Inc.","edition":"1"},{"key":"e_1_3_2_1_34_1","volume-title":"Jianfeng Gao, and Yejin Choi.","author":"Bisk Yonatan","year":"2019","unstructured":"Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, and Yejin Choi. 2019. PIQA: Reasoning about Physical Commonsense in Natural Language. (2019). arXiv:cs.CL\/1911.11641"},{"key":"e_1_3_2_1_35_1","unstructured":"Sebastian Borgeaud Arthur Mensch Jordan Hoffmann Trevor Cai Eliza Rutherford Katie Millican George van den Driessche Jean-Baptiste Lespiau Bogdan Damoc Aidan Clark Diego de Las Casas Aurelia Guy Jacob Menick Roman Ring Tom Hennigan Saffron Huang Loren Maggiore Chris Jones Albin Cassirer Andy Brock Michela Paganini Geoffrey Irving Oriol Vinyals Simon Osindero Karen Simonyan Jack W. Rae Erich Elsen and Laurent Sifre. 2022. Improving language models by retrieving from trillions of tokens. (2022). arXiv:cs.CL\/2112.04426"},{"key":"e_1_3_2_1_36_1","unstructured":"Sebastian Borgeaud Arthur Mensch Jordan Hoffmann Trevor Cai Eliza Rutherford Katie Millican George van den Driessche Jean-Baptiste Lespiau Bogdan Damoc Aidan Clark Diego de Las Casas Aurelia Guy Jacob Menick Roman Ring Tom Hennigan Saffron Huang Loren Maggiore Chris Jones Albin Cassirer Andy Brock Michela Paganini Geoffrey Irving Oriol Vinyals Simon Osindero Karen Simonyan Jack W. Rae Erich Elsen and Laurent Sifre. 2022. Improving language models by retrieving from trillions of tokens. (2022). arXiv:cs.CL\/2112.04426 https:\/\/arxiv.org\/abs\/2112.04426"},{"key":"e_1_3_2_1_37_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. (2020). arXiv:cs.CL\/2005.14165"},{"key":"e_1_3_2_1_38_1","volume-title":"Real-World Use Cases for Large Language Models (LLMs) | by CellStrat | Medium. https:\/\/cellstrat.medium.com\/real-world-use-cases-for-large-language-models-llms-d71c3a577bf2. (April","year":"2023","unstructured":"CellStrat. 2023. Real-World Use Cases for Large Language Models (LLMs) | by CellStrat | Medium. https:\/\/cellstrat.medium.com\/real-world-use-cases-for-large-language-models-llms-d71c3a577bf2. (April 2023). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_39_1","unstructured":"Harrison Chase. 2022. LangChain. (Oct. 2022). https:\/\/github.com\/langchain-ai\/langchain"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Danqi Chen Adam Fisch Jason Weston and Antoine Bordes. 2017. Reading Wikipedia to Answer Open-Domain Questions. (2017). arXiv:cs.CL\/1704.00051","DOI":"10.18653\/v1\/P17-1171"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1184\/R1\/6605324.v1"},{"key":"e_1_3_2_1_42_1","unstructured":"Tianlong Chen Zhenyu Zhang Ajay Jaiswal Shiwei Liu and Zhangyang Wang. 2023. Sparse MoE as the New Dropout: Scaling Dense and Self-Slimmable Transformers. (2023). arXiv:cs.LG\/2303.01610"},{"key":"e_1_3_2_1_43_1","unstructured":"Rewon Child Scott Gray Alec Radford and Ilya Sutskever. 2019. Generating Long Sequences with Sparse Transformers. (2019). arXiv:cs.LG\/1904.10509"},{"key":"e_1_3_2_1_44_1","volume-title":"Charles Sutton, Sebastian Gehrmann, et al.","author":"Chowdhery Aakanksha","year":"2022","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2022. Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311 (2022)."},{"key":"e_1_3_2_1_45_1","unstructured":"Zihang Dai* Zhilin Yang* Yiming Yang William W. Cohen Jaime Carbonell Quoc V. Le and Ruslan Salakhutdinov. 2019. Transformer-XL: Language Modeling with Longer-Term Dependency. (2019). https:\/\/openreview.net\/forum?id=HJePno0cYm"},{"key":"e_1_3_2_1_46_1","unstructured":"Daivi. 21. 7 Top Large Language Model Use Cases And Applications. https:\/\/www.projectpro.io\/article\/large-language-model-use-cases-and-applications\/887. (March 21). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_47_1","unstructured":"Tri Dao Daniel Y. Fu Stefano Ermon Atri Rudra and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. (2022). arXiv:cs.LG\/2205.14135"},{"key":"e_1_3_2_1_48_1","volume-title":"int8 (): 8-bit matrix multiplication for transformers at scale. arXiv preprint arXiv:2208.07339","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. Llm. int8 (): 8-bit matrix multiplication for transformers at scale. arXiv preprint arXiv:2208.07339 (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Jiayu Ding Shuming Ma Li Dong Xingxing Zhang Shaohan Huang Wenhui Wang Nanning Zheng and Furu Wei. 2023. LongNet: Scaling Transformers to 1 000 000 000 Tokens. (2023). arXiv:cs.CL\/2307.02486","DOI":"10.14218\/JCTH.2022.00006S"},{"key":"e_1_3_2_1_51_1","volume-title":"Chengruidong Zhang, Yuanyuan Xu, Ning Shang, Jiahang Xu, Fan Yang, and Mao Yang.","author":"Ding Yiran","year":"2024","unstructured":"Yiran Ding, Li Lyna Zhang, Chengruidong Zhang, Yuanyuan Xu, Ning Shang, Jiahang Xu, Fan Yang, and Mao Yang. 2024. LongRoPE: Extending LLM Context Window Beyond 2 Million Tokens. arXiv preprint arXiv:2402.13753 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"AttentionStore: Cost-effective Attention Reuse across Multi-turn Conversations in Large Language Model Serving. arXiv preprint arXiv:2403.19708","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. AttentionStore: Cost-effective Attention Reuse across Multi-turn Conversations in Large Language Model Serving. arXiv preprint arXiv:2403.19708 (2024)."},{"key":"e_1_3_2_1_53_1","unstructured":"Yunfan Gao Yun Xiong Xinyu Gao Kangxiang Jia Jinliu Pan Yuxi Bi Yi Dai Jiawei Sun Qianyu Guo Meng Wang and Haofen Wang. 2024. Retrieval-Augmented Generation for Large Language Models: A Survey. (2024). arXiv:cs.CL\/2312.10997"},{"key":"e_1_3_2_1_54_1","unstructured":"Suyu Ge Yunan Zhang Liyuan Liu Minjia Zhang Jiawei Han and Jianfeng Gao. 2023. Model Tells You What to Discard: Adaptive KV Cache Compression for LLMs. (2023). arXiv:cs.CL\/2310.01801"},{"key":"e_1_3_2_1_55_1","volume-title":"In-context Autoencoder for Context Compression in a Large Language Model. arXiv preprint arXiv:2307.06945","author":"Ge Tao","year":"2023","unstructured":"Tao Ge, Jing Hu, Xun Wang, Si-Qing Chen, and Furu Wei. 2023. In-context Autoencoder for Context Compression in a Large Language Model. arXiv preprint arXiv:2307.06945 (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=uREj4ZuGJE","author":"Ge Tao","year":"2024","unstructured":"Tao Ge, Hu Jing, Lei Wang, Xun Wang, Si-Qing Chen, and Furu Wei. 2024. In-context Autoencoder for Context Compression in a Large Language Model. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=uREj4ZuGJE"},{"key":"e_1_3_2_1_57_1","unstructured":"GGML. [n. d.]. GGML - AI at the edge. https:\/\/ggml.ai\/. ([n. d.])."},{"key":"e_1_3_2_1_58_1","volume-title":"Nikhil Sarda, Anurag Khandelwal, and Lin Zhong.","author":"Gim In","year":"2023","unstructured":"In Gim, Guojun Chen, Seung seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2023. Prompt Cache: Modular Attention Reuse for Low-Latency Inference. (2023). arXiv:cs.CL\/2311.04934"},{"key":"e_1_3_2_1_59_1","volume-title":"Nikhil Sarda, Anurag Khandelwal, and Lin Zhong.","author":"Gim In","year":"2023","unstructured":"In Gim, Guojun Chen, Seung seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2023. Prompt Cache: Modular Attention Reuse for Low-Latency Inference. (2023). arXiv:cs.CL\/2311.04934"},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research)","author":"Hawthorne Curtis","unstructured":"Curtis Hawthorne, Andrew Jaegle, C\u0103t\u0103lina Cangea, Sebastian Borgeaud, Charlie Nash, Mateusz Malinowski, Sander Dieleman, Oriol Vinyals, Matthew Botvinick, Ian Simon, Hannah Sheahan, Neil Zeghidour, Jean-Baptiste Alayrac, Joao Carreira, and Jesse Engel. 2022. General-purpose, long-context autoregressive modeling with Perceiver AR. In Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research), Kamalika Chaudhuri, Stefanie Jegelka, Le Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato (Eds.), Vol. 162. PMLR, 8535--8558. https:\/\/proceedings.mlr.press\/v162\/hawthorne22a.html"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","unstructured":"Hermann Hellwagner Ingo Kofler Michael Eberhard Robert Kuschnig Michael Ransburg and Michael Sablatschan. 2011. Scalable Video Coding: Techniques and Applications for Adaptive Streaming. 1--23. 10.4018\/978-1-61692-831-5","DOI":"10.4018\/978-1-61692-831-5"},{"key":"e_1_3_2_1_62_1","volume-title":"Kurt Keutzer, and Amir Gholami.","author":"Hooper Coleman","year":"2024","unstructured":"Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael W Mahoney, Yakun Sophia Shao, Kurt Keutzer, and Amir Gholami. 2024. KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache Quantization. arXiv preprint arXiv:2401.18079 (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"RULER: What's the Real Context Size of Your Long-Context Language Models? arXiv preprint arXiv:2404.06654","author":"Hsieh Cheng-Ping","year":"2024","unstructured":"Cheng-Ping Hsieh, Simeng Sun, Samuel Kriman, Shantanu Acharya, Dima Rekesh, Fei Jia, and Boris Ginsburg. 2024. RULER: What's the Real Context Size of Your Long-Context Language Models? arXiv preprint arXiv:2404.06654 (2024)."},{"key":"e_1_3_2_1_64_1","unstructured":"Huggingface. [n. d.]. Huggingface Transformers. https:\/\/huggingface.co\/docs\/transformers\/index. ([n. d.])."},{"key":"e_1_3_2_1_65_1","unstructured":"Huggingface. [n. d.]. Perplexity in fixed length models. https:\/\/huggingface.co\/docs\/transformers\/perplexity. ([n. d.])."},{"key":"e_1_3_2_1_66_1","unstructured":"Amazon Inc. [n. d.]. Amazon Mechanical Turk. https:\/\/www.mturk.com\/. ([n. d.])."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"crossref","unstructured":"Gautier Izacard and Edouard Grave. 2021. Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering. (2021). arXiv:cs.CL\/2007.01282","DOI":"10.18653\/v1\/2021.eacl-main.74"},{"key":"e_1_3_2_1_68_1","volume-title":"Few-shot learning with retrieval augmented language models. arXiv preprint arXiv:2208.03299","author":"Izacard Gautier","year":"2022","unstructured":"Gautier Izacard, Patrick Lewis, Maria Lomeli, Lucas Hosseini, Fabio Petroni, Timo Schick, Jane Dwivedi-Yu, Armand Joulin, Sebastian Riedel, and Edouard Grave. 2022. Few-shot learning with retrieval augmented language models. arXiv preprint arXiv:2208.03299 (2022)."},{"key":"e_1_3_2_1_69_1","unstructured":"Naman Jain Tianjun Zhang Wei-Lin Chiang Joseph E. Gonzalez Koushik Sen and Ion Stoica. 2023. LLM-Assisted Code Cleaning For Training Accurate Code Generators. (2023). arXiv:cs.LG\/2311.14904"},{"key":"e_1_3_2_1_70_1","volume-title":"Skyplane: Optimizing Transfer Cost and Throughput Using Cloud-Aware Overlays. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Jain Paras","year":"2023","unstructured":"Paras Jain, Sam Kumar, Sarah Wooders, Shishir G. Patil, Joseph E. Gonzalez, and Ion Stoica. 2023. Skyplane: Optimizing Transfer Cost and Throughput Using Cloud-Aware Overlays. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 1375--1389. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/jain"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Huiqiang Jiang Qianhui Wu Chin-Yew Lin Yuqing Yang and Lili Qiu. 2023. LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models. (2023). arXiv:cs.CL\/2310.05736","DOI":"10.18653\/v1\/2023.emnlp-main.825"},{"key":"e_1_3_2_1_72_1","unstructured":"Huiqiang Jiang Qianhui Wu Xufang Luo Dongsheng Li Chin-Yew Lin Yuqing Yang and Lili Qiu. 2023. LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios via Prompt Compression. (2023). arXiv:cs.CL\/2310.06839"},{"key":"e_1_3_2_1_73_1","volume-title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","author":"Jimenez Carlos E.","year":"2023","unstructured":"Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan. 2023. SWE-bench: Can Language Models Resolve Real-World GitHub Issues? (2023). arXiv:cs.CL\/2310.06770"},{"key":"e_1_3_2_1_74_1","volume-title":"RAGCache: Efficient Knowledge Caching for Retrieval-Augmented Generation. arXiv preprint arXiv:2404.12457","author":"Jin Chao","year":"2024","unstructured":"Chao Jin, Zili Zhang, Xuanlin Jiang, Fangyue Liu, Xin Liu, Xuanzhe Liu, and Xin Jin. 2024. RAGCache: Efficient Knowledge Caching for Retrieval-Augmented Generation. arXiv preprint arXiv:2404.12457 (2024)."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"crossref","unstructured":"Mandar Joshi Eunsol Choi Daniel S. Weld and Luke Zettlemoyer. 2017. TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension. (2017). arXiv:cs.CL\/1705.03551","DOI":"10.18653\/v1\/P17-1147"},{"key":"e_1_3_2_1_76_1","volume-title":"How does ChatGPT store history of chat. https:\/\/community.openai.com\/t\/how-does-chatgpt-store-history-of-chat\/319608\/2. (Aug","year":"2023","unstructured":"jwatte. 2023. How does ChatGPT store history of chat. https:\/\/community.openai.com\/t\/how-does-chatgpt-store-history-of-chat\/319608\/2. (Aug 2023). OpenAI Community Forum."},{"key":"e_1_3_2_1_77_1","volume-title":"Reproducible Performance Metrics for LLM Inference. (Nov","author":"Kadous Waleed","year":"2023","unstructured":"Waleed Kadous, Kyle Huang, Wendi Ding, Liguang Xie, Avnish Narayan, and Ricky Xu. 2023. Reproducible Performance Metrics for LLM Inference. (Nov. 2023). https:\/\/www.anyscale.com\/blog\/reproducible-performance-metrics-for-llm-inference Accessed: 2024-06-01."},{"key":"e_1_3_2_1_78_1","volume-title":"Gear: An efficient kv cache compression recipefor near-lossless generative inference of llm. arXiv preprint arXiv:2403.05527","author":"Kang Hao","year":"2024","unstructured":"Hao Kang, Qingru Zhang, Souvik Kundu, Geonhwa Jeong, Zaoxing Liu, Tushar Krishna, and Tuo Zhao. 2024. Gear: An efficient kv cache compression recipefor near-lossless generative inference of llm. arXiv preprint arXiv:2403.05527 (2024)."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"crossref","unstructured":"Vladimir Karpukhin Barlas O\u011fuz Sewon Min Patrick Lewis Ledell Wu Sergey Edunov Danqi Chen and Wen tau Yih. 2020. Dense Passage Retrieval for Open-Domain Question Answering. (2020). arXiv:cs.CL\/2004.04906","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"e_1_3_2_1_80_1","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Keahey Kate","year":"2020","unstructured":"Kate Keahey, Jason Anderson, Zhuo Zhen, Pierre Riteau, Paul Ruth, Dan Stanzione, Mert Cevik, Jacob Colleran, Haryadi S. Gunawi, Cody Hammock, Joe Mambretti, Alexander Barnes, Fran\u00e7ois Halbah, Alex Rocha, and Joe Stubbs. 2020. Lessons Learned from the Chameleon Testbed. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). USENIX Association, 219--233. https:\/\/www.usenix.org\/conference\/atc20\/presentation\/keahey"},{"key":"e_1_3_2_1_81_1","volume-title":"G\u00e1bor Melis, and Edward Grefenstette.","author":"Ko\u010disk\u00fd Tom\u00e1\u0161","year":"2017","unstructured":"Tom\u00e1\u0161 Ko\u010disk\u00fd, Jonathan Schwarz, Phil Blunsom, Chris Dyer, Karl Moritz Hermann, G\u00e1bor Melis, and Edward Grefenstette. 2017. The NarrativeQA Reading Comprehension Challenge. (2017). arXiv:cs.CL\/1712.07040"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_83_1","volume-title":"langchain-ai\/langchain:Building applications with LLMs through composability. https:\/\/github.com\/langchain-ai\/langchain. (Feburary","year":"2024","unstructured":"LangChain. 2024. langchain-ai\/langchain:Building applications with LLMs through composability. https:\/\/github.com\/langchain-ai\/langchain. (Feburary 2024). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_84_1","volume-title":"Store and reference chat history | Langchain. https:\/\/python.langchain.com\/docs\/use_cases\/question_answering\/how_to\/chat_vector_db. (Feburary","year":"2024","unstructured":"LangChain. 2024. Store and reference chat history | Langchain. https:\/\/python.langchain.com\/docs\/use_cases\/question_answering\/how_to\/chat_vector_db. (Feburary 2024). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_85_1","unstructured":"Benjamin Lefaudeux Francisco Massa Diana Liskovich Wenhan Xiong Vittorio Caggiano Sean Naren Min Xu Jieru Hu Marta Tintore Susan Zhang Patrick Labatut Daniel Haziza Luca Wehrstedt Jeremy Reizenstein and Grigory Sizov. 2022. xFormers: A modular and hackable Transformer modelling library. https:\/\/github.com\/facebookresearch\/xformers. (2022)."},{"key":"e_1_3_2_1_86_1","volume-title":"International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:254096365","author":"Leviathan Yaniv","unstructured":"Yaniv Leviathan, Matan Kalman, and Y. Matias. 2022. Fast Inference from Transformers via Speculative Decoding. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:254096365"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1093\/jcmc\/zmy009"},{"key":"e_1_3_2_1_88_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_89_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2021","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. (2021). arXiv:cs.CL\/2005.11401"},{"key":"e_1_3_2_1_90_1","volume-title":"Joseph E. Gonzalez, Ion Stoica, Xuezhe Ma, and Hao Zhang.","author":"Dacheng","year":"2023","unstructured":"Dacheng Li*, Rulin Shao*, Anze Xie, Lianmin Zheng Ying Sheng, Joseph E. Gonzalez, Ion Stoica, Xuezhe Ma, and Hao Zhang. 2023. How Long Can Open-Source LLMs Truly Promise on Context Length? (June 2023). https:\/\/lmsys.org\/blog\/2023-06-29-longchat"},{"key":"e_1_3_2_1_91_1","unstructured":"Bin Lin Tao Peng Chen Zhang Minmin Sun Lanbo Li Hanyu Zhao Wencong Xiao Qi Xu Xiafei Qiu Shen Li Zhigang Ji Yong Li and Wei Lin. 2024. Infinite-LLM: Efficient LLM Service for Long Context with DistAttention and Distributed KVCache. (2024). arXiv:cs.DC\/2401.02669"},{"key":"e_1_3_2_1_92_1","unstructured":"Tianyang Lin Yuxin Wang Xiangyang Liu and Xipeng Qiu. 2021. A Survey of Transformers. (2021). arXiv:cs.LG\/2106.04554"},{"key":"e_1_3_2_1_93_1","volume-title":"Andes: Defining and Enhancing Quality-of-Experience in LLM-Based Text Streaming Services. arXiv preprint arXiv:2404.16283","author":"Liu Jiachen","year":"2024","unstructured":"Jiachen Liu, Zhiyu Wu, Jae-Won Chung, Fan Lai, Myungjin Lee, and Mosharaf Chowdhury. 2024. Andes: Defining and Enhancing Quality-of-Experience in LLM-Based Text Streaming Services. arXiv preprint arXiv:2404.16283 (2024)."},{"key":"e_1_3_2_1_94_1","volume-title":"Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172","author":"Liu Nelson F","year":"2023","unstructured":"Nelson F Liu, Kevin Lin, John Hewitt, Ashwin Paranjape, Michele Bevilacqua, Fabio Petroni, and Percy Liang. 2023. Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172 (2023)."},{"key":"e_1_3_2_1_95_1","volume-title":"Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time. arXiv preprint arXiv:2305.17118","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu, Anastasios Kyrillidis, and Anshumali Shrivastava. 2023. Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time. arXiv preprint arXiv:2305.17118 (2023)."},{"key":"e_1_3_2_1_96_1","volume-title":"Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time. arXiv preprint arXiv:2305.17118","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu, Anastasios Kyrillidis, and Anshumali Shrivastava. 2023. Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time. arXiv preprint arXiv:2305.17118 (2023)."},{"key":"e_1_3_2_1_97_1","volume-title":"KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache. arXiv preprint arXiv:2402.02750","author":"Liu Zirui","year":"2024","unstructured":"Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen Zhong, Zhaozhuo Xu, Vladimir Braverman, Beidi Chen, and Xia Hu. 2024. KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache. arXiv preprint arXiv:2402.02750 (2024)."},{"key":"e_1_3_2_1_98_1","unstructured":"llama.cpp. [n. d.]. llama.cpp. https:\/\/github.com\/ggerganov\/llama.cpp\/. ([n. d.])."},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626111.3628183"},{"key":"e_1_3_2_1_100_1","unstructured":"Ignacio Martinez. 2023. privateGPT. https:\/\/github.com\/imartinez\/privateGPT. (2023)."},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01088"},{"key":"e_1_3_2_1_102_1","unstructured":"Stephen Merity Caiming Xiong James Bradbury and Richard Socher. 2016. Pointer Sentinel Mixture Models. (2016). arXiv:cs.CL\/1609.07843"},{"key":"e_1_3_2_1_103_1","volume-title":"Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia.","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. 2023. SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification. arXiv preprint arXiv:2305.09781 (2023)."},{"key":"e_1_3_2_1_104_1","volume-title":"Xiang Lisa Li, and Noah Goodman","author":"Mu Jesse","year":"2023","unstructured":"Jesse Mu, Xiang Lisa Li, and Noah Goodman. 2023. Learning to compress prompts with gist tokens. arXiv preprint arXiv:2304.08467 (2023)."},{"key":"e_1_3_2_1_105_1","unstructured":"Author's Name. Year of Publication. LLMs in Finance: BloombergGPT and FinGPT - What You Need to Know. Medium. (Year of Publication). https:\/\/12gunika.medium.com\/llms-in-finance-bloomberggpt-and-fingpt-what-you-need-to-know-2fdf3af29217"},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"crossref","unstructured":"Yixin Nie Songhe Wang and Mohit Bansal. 2019. Revealing the Importance of Semantic Retrieval for Machine Reading at Scale. (2019). arXiv:cs.CL\/1909.08041","DOI":"10.18653\/v1\/D19-1258"},{"key":"e_1_3_2_1_107_1","unstructured":"Antonio Nucci. 2024. Large Language Models in Financial Services & Banking. (2024). https:\/\/aisera.com\/blog\/large-language-models-in-financial-services-banking\/"},{"key":"e_1_3_2_1_108_1","volume-title":"GPT-4 API general availability and deprecation of older models in the Completions API. https:\/\/openai.com\/blog\/gpt-4-api-general-availability. (April","author":"AI.","year":"2024","unstructured":"OpenAI. 2024. GPT-4 API general availability and deprecation of older models in the Completions API. https:\/\/openai.com\/blog\/gpt-4-api-general-availability. (April 2024). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1137\/090752286"},{"key":"e_1_3_2_1_110_1","volume-title":"Percy Liang, and Michael S.","author":"Park Joon Sung","year":"2023","unstructured":"Joon Sung Park, Joseph C. O'Brien, Carrie J. Cai, Meredith Ringel Morris, Percy Liang, and Michael S. Bernstein. 2023. Generative Agents: Interactive Simulacra of Human Behavior. (2023). arXiv:cs.HC\/2304.03442"},{"key":"e_1_3_2_1_111_1","volume-title":"Splitwise: Efficient generative llm inference using phase splitting. arXiv preprint arXiv:2311.18677","author":"Patel Pratyush","year":"2023","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, \u00cd\u00f1igo Goiri, Aashaka Shah, Saeed Maleki, and Ricardo Bianchini. 2023. Splitwise: Efficient generative llm inference using phase splitting. arXiv preprint arXiv:2311.18677 (2023)."},{"key":"e_1_3_2_1_112_1","unstructured":"Reiner Pope Sholto Douglas Aakanksha Chowdhery Jacob Devlin James Bradbury Anselm Levskaya Jonathan Heek Kefan Xiao Shivani Agrawal and Jeff Dean. 2022. Efficiently Scaling Transformer Inference. (2022). arXiv:cs.LG\/2211.05102"},{"key":"e_1_3_2_1_113_1","doi-asserted-by":"crossref","unstructured":"Ori Ram Yoav Levine Itay Dalmedigos Dor Muhlgay Amnon Shashua Kevin Leyton-Brown and Yoav Shoham. 2023. In-Context Retrieval-Augmented Language Models. (2023). arXiv:cs.CL\/2302.00083","DOI":"10.1162\/tacl_a_00605"},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_115_1","volume-title":"Carlo Luschi, and Douglas Orr","author":"Ribar Luka","year":"2023","unstructured":"Luka Ribar, Ivan Chelombiev, Luke Hudlass-Galley, Charlie Blake, Carlo Luschi, and Douglas Orr. 2023. SparQ Attention: Bandwidth-Efficient LLM Inference. (2023). arXiv:cs.LG\/2312.04985"},{"key":"e_1_3_2_1_116_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00353"},{"key":"e_1_3_2_1_117_1","volume-title":"Long-range Language Modeling with Self-retrieval. arXiv preprint arXiv:2306.13421","author":"Rubin Ohad","year":"2023","unstructured":"Ohad Rubin and Jonathan Berant. 2023. Long-range Language Modeling with Self-retrieval. arXiv preprint arXiv:2306.13421 (2023)."},{"key":"e_1_3_2_1_118_1","volume-title":"Enrich Your Precedents with the Use of AI. Data Science Dojo. (25","author":"Saleem Ayesha","year":"2023","unstructured":"Ayesha Saleem. 2023. LLM for Lawyers, Enrich Your Precedents with the Use of AI. Data Science Dojo. (25 July 2023). https:\/\/datasciencedojo.com\/blog\/llm-for-lawyers\/"},{"key":"e_1_3_2_1_119_1","doi-asserted-by":"crossref","unstructured":"Hang Shao Bei Liu and Yanmin Qian. 2024. One-Shot Sensitivity-Aware Mixed Sparsity Pruning for Large Language Models. (2024). arXiv:cs.CL\/2310.09499","DOI":"10.1109\/ICASSP48485.2024.10445737"},{"key":"e_1_3_2_1_120_1","unstructured":"Ying Sheng Lianmin Zheng Binhang Yuan Zhuohan Li Max Ryabinin Daniel Y Fu Zhiqiang Xie Beidi Chen Clark Barrett Joseph E Gonzalez et al. 2023. High-throughput generative inference of large language models with a single gpu. arXiv preprint arXiv:2303.06865 (2023)."},{"key":"e_1_3_2_1_121_1","unstructured":"Zijing Shi Meng Fang Shunfeng Zheng Shilong Deng Ling Chen and Yali Du. 2023. Cooperation on the Fly: Exploring Language Agents for Ad Hoc Teamwork in the Avalon Game. (2023). arXiv:cs.CL\/2312.17515"},{"key":"e_1_3_2_1_122_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_123_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582688"},{"key":"e_1_3_2_1_124_1","volume-title":"Do long-range language models actually use long-range context? arXiv preprint arXiv:2109.09115","author":"Sun Simeng","year":"2021","unstructured":"Simeng Sun, Kalpesh Krishna, Andrew Mattarella-Micke, and Mohit Iyyer. 2021. Do long-range language models actually use long-range context? arXiv preprint arXiv:2109.09115 (2021)."},{"key":"e_1_3_2_1_125_1","unstructured":"Pavlo Sydorenko. 2023. Top 5 Applications of Large Language Models (LLMs) in Legal Practice. Medium. (2023). https:\/\/medium.com\/jurdep\/top-5-applications-of-large-language-models-llms-in-legal-practice-d29cde9c38ef"},{"key":"e_1_3_2_1_126_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2012.2221526"},{"key":"e_1_3_2_1_127_1","unstructured":"Zilliz Technology. 2023. GPTCache. https:\/\/github.com\/zilliztech\/GPTCache. (2023)."},{"key":"e_1_3_2_1_128_1","volume-title":"12 Practical Large Language Model (LLM) Applications - Techopedia. https:\/\/www.techopedia.com\/12-practical-large-language-model-llm-applications. (January","author":"Tim Keary","year":"2024","unstructured":"Keary Tim. 2024. 12 Practical Large Language Model (LLM) Applications - Techopedia. https:\/\/www.techopedia.com\/12-practical-large-language-model-llm-applications. (January 2024). (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_129_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. (2023). arXiv:cs.CL\/2302.13971 https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_2_1_130_1","volume-title":"Focused transformer: Contrastive training for context scaling. arXiv preprint arXiv:2307.03170","author":"Tworkowski Szymon","year":"2023","unstructured":"Szymon Tworkowski, Konrad Staniszewski, Miko\u0142aj Pacek, Yuhuai Wu, Henryk Michalewski, and Piotr Mi\u0142o\u015b. 2023. Focused transformer: Contrastive training for context scaling. arXiv preprint arXiv:2307.03170 (2023)."},{"key":"e_1_3_2_1_131_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2023. Attention Is All You Need. (2023). arXiv:cs.CL\/1706.03762"},{"key":"e_1_3_2_1_132_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587451"},{"key":"e_1_3_2_1_133_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567505"},{"key":"e_1_3_2_1_134_1","volume-title":"Zemi: Learning Zero-Shot Semi-Parametric Language Models from Multiple Tasks.","author":"Wang Zhenhailong","year":"2023","unstructured":"Zhenhailong Wang, Xiaoman Pan, Dian Yu, Dong Yu, Jianshu Chen, and Heng Ji. 2023. Zemi: Learning Zero-Shot Semi-Parametric Language Models from Multiple Tasks. (2023). arXiv:cs.CL\/2210.00185"},{"key":"e_1_3_2_1_135_1","doi-asserted-by":"publisher","DOI":"10.1145\/214762.214771"},{"key":"e_1_3_2_1_136_1","volume-title":"Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush.","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Perric Cistac, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. Association for Computational Linguistics, 38--45. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_137_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38--45","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38--45. https:\/\/www.aclweb.org\/anthology\/2020.emnlpdemos.6"},{"key":"e_1_3_2_1_138_1","volume-title":"LoongServe: Efficiently Serving Long-context Large Language Models with Elastic Sequence Parallelism. arXiv preprint arXiv:2404.09526","author":"Wu Bingyang","year":"2024","unstructured":"Bingyang Wu, Shengyu Liu, Yinmin Zhong, Peng Sun, Xuanzhe Liu, and Xin Jin. 2024. LoongServe: Efficiently Serving Long-context Large Language Models with Elastic Sequence Parallelism. arXiv preprint arXiv:2404.09526 (2024)."},{"key":"e_1_3_2_1_139_1","unstructured":"Bingyang Wu Yinmin Zhong Zili Zhang Gang Huang Xuanzhe Liu and Xin Jin. 2023. Fast Distributed Inference Serving for Large Language Models. (2023). arXiv:cs.LG\/2305.05920"},{"key":"e_1_3_2_1_140_1","unstructured":"Dekun Wu Haochen Shi Zhiyuan Sun and Bang Liu. 2023. Deciphering Digital Detectives: Understanding LLM Behaviors and Capabilities in Multi-Agent Mystery Games. (2023). arXiv:cs.AI\/2312.00746"},{"key":"e_1_3_2_1_141_1","volume-title":"Memorizing Transformers. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=TrjbxzRcnf-","author":"Wu Yuhuai","year":"2022","unstructured":"Yuhuai Wu, Markus Norman Rabe, DeLesley Hutchins, and Christian Szegedy. 2022. Memorizing Transformers. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=TrjbxzRcnf-"},{"key":"e_1_3_2_1_142_1","volume-title":"International Conference on Machine Learning. PMLR, 38087--38099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087--38099."},{"key":"e_1_3_2_1_143_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.244"},{"key":"e_1_3_2_1_144_1","unstructured":"Peng Xu Wei Ping Xianchao Wu Lawrence McAfee Chen Zhu Zihan Liu Sandeep Subramanian Evelina Bakhturina Mohammad Shoeybi and Bryan Catanzaro. 2024. Retrieval meets Long Context Large Language Models. (2024). arXiv:cs.CL\/2310.03025"},{"key":"e_1_3_2_1_145_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-4013"},{"key":"e_1_3_2_1_146_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498495"},{"key":"e_1_3_2_1_147_1","volume-title":"CacheBlend: Fast Large Language Model Serving with Cached Knowledge Fusion. arXiv preprint arXiv:2405.16444","author":"Yao Jiayi","year":"2024","unstructured":"Jiayi Yao, Hanchen Li, Yuhan Liu, Siddhant Ray, Yihua Cheng, Qizheng Zhang, Kuntai Du, Shan Lu, and Junchen Jiang. 2024. CacheBlend: Fast Large Language Model Serving with Cached Knowledge Fusion. arXiv preprint arXiv:2405.16444 (2024)."},{"key":"e_1_3_2_1_148_1","volume-title":"EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models. arXiv preprint arXiv:2308.14352","author":"Yi Rongjie","year":"2023","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2023. EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models. arXiv preprint arXiv:2308.14352 (2023)."},{"key":"e_1_3_2_1_149_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521--538."},{"key":"e_1_3_2_1_150_1","volume-title":"Big Bird: Transformers for Longer Sequences.","author":"Zaheer Manzil","year":"2021","unstructured":"Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, and Amr Ahmed. 2021. Big Bird: Transformers for Longer Sequences. (2021). arXiv:cs.LG\/2007.14062"},{"key":"e_1_3_2_1_151_1","unstructured":"Lin Zehui Pengfei Liu Luyao Huang Junkun Chen Xipeng Qiu and Xuanjing Huang. 2019. DropAttention: A Regularization Method for Fully-Connected Self-Attention Networks. (2019). arXiv:cs.CL\/1907.11065"},{"key":"e_1_3_2_1_152_1","volume-title":"Workshop on Efficient Systems for Foundation Models @ ICML2023","author":"Zhang Zhenyu","year":"2023","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher Re, Clark Barrett, Zhangyang Wang, and Beidi Chen. 2023. H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. In Workshop on Efficient Systems for Foundation Models @ ICML2023. https:\/\/openreview.net\/forum?id=ctPizehA9D"},{"key":"e_1_3_2_1_153_1","unstructured":"Zhenyu Zhang Ying Sheng Tianyi Zhou Tianlong Chen Lianmin Zheng Ruisi Cai Zhao Song Yuandong Tian Christopher R\u00e9 Clark Barrett Zhangyang Wang and Beidi Chen. 2023. H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. (2023). arXiv:cs.LG\/2306.14048"},{"key":"e_1_3_2_1_154_1","unstructured":"Qibin Zhao Guoxu Zhou Shengli Xie Liqing Zhang and Andrzej Cichocki. 2016. Tensor Ring Decomposition. (2016). arXiv:cs.NA\/1606.05535"},{"key":"e_1_3_2_1_155_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric. P Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. (2023). arXiv:cs.CL\/2306.05685"},{"key":"e_1_3_2_1_156_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al.","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. 2023. Efficiently programming large language models using sglang. arXiv preprint arXiv:2312.07104 (2023)."},{"key":"e_1_3_2_1_157_1","unstructured":"Yinmin Zhong Shengyu Liu Junda Chen Jianbo Hu Yibo Zhu Xuanzhe Liu Xin Jin and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. (2024). arXiv:cs.DC\/2401.09670"}],"event":{"name":"ACM SIGCOMM '24: ACM SIGCOMM 2024 Conference","location":"Sydney NSW Australia","acronym":"ACM SIGCOMM '24","sponsor":["SIGCOMM ACM Special Interest Group on Data Communication"]},"container-title":["Proceedings of the ACM SIGCOMM 2024 Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3651890.3672274","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3651890.3672274","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T17:49:13Z","timestamp":1750268953000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3651890.3672274"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,4]]},"references-count":157,"alternative-id":["10.1145\/3651890.3672274","10.1145\/3651890"],"URL":"https:\/\/doi.org\/10.1145\/3651890.3672274","relation":{},"subject":[],"published":{"date-parts":[[2024,8,4]]},"assertion":[{"value":"2024-08-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}