{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T00:39:04Z","timestamp":1778373544552,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Swiss National Science Foundation","award":["10.001.796"],"award-info":[{"award-number":["10.001.796"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3721146.3721941","type":"proceedings-article","created":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T17:42:05Z","timestamp":1743529325000},"page":"66-73","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Leveraging Approximate Caching for Faster Retrieval-Augmented Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3484-1452","authenticated-orcid":false,"given":"Shai Aviram","family":"Bergman","sequence":"first","affiliation":[{"name":"Huawei, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6184-7279","authenticated-orcid":false,"given":"Zhang","family":"Ji","sequence":"additional","affiliation":[{"name":"Huawei, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8187-724X","authenticated-orcid":false,"given":"Anne-Marie","family":"Kermarrec","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2229-235X","authenticated-orcid":false,"given":"Diana","family":"Petrescu","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7826-1599","authenticated-orcid":false,"given":"Rafael","family":"Pires","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6844-4695","authenticated-orcid":false,"given":"Mathis","family":"Randl","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4157-4847","authenticated-orcid":false,"given":"Martijn","family":"de Vos","sequence":"additional","affiliation":[{"name":"EPFL, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361, 2020."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-024-07930-y"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.123"},{"key":"e_1_3_2_1_4_1","first-page":"2518","volume-title":"The 2024 ACM Conference on Fairness, Accountability, and Transparency","author":"Lee Yoonjoo","year":"2024","unstructured":"Yoonjoo Lee, Kihoon Son, Tae Soo Kim, Jisu Kim, John Joon Young Chung, Eytan Adar, and Juho Kim. One vs. many: Comprehending accurate information from multiple erroneous and inconsistent ai generations. In The 2024 ACM Conference on Fairness, Accountability, and Transparency, pages 2518--2531, 2024."},{"key":"e_1_3_2_1_5_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems, 33:9459--9474, 2020.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1017\/nlp.2024.53"},{"key":"e_1_3_2_1_7_1","volume-title":"Towards understanding systems trade-offs in retrieval-augmented generation model inference. arXiv preprint arXiv:2412.11854","author":"Shen Michael","year":"2024","unstructured":"Michael Shen, Muhammad Umar, Kiwan Maeng, G Edward Suh, and Udit Gupta. Towards understanding systems trade-offs in retrieval-augmented generation model inference. arXiv preprint arXiv:2412.11854, 2024."},{"key":"e_1_3_2_1_8_1","volume-title":"Accelerating inference of retrieval-augmented generation via sparse context selection. arXiv preprint arXiv:2405.16178","author":"Zhu Yun","year":"2024","unstructured":"Yun Zhu, Jia-Chen Gu, Caitlin Sikora, Ho Ko, Yinxiao Liu, Chu-Cheng Lin, Lei Shu, Liangchen Luo, Lei Meng, Bang Liu, et al. Accelerating inference of retrieval-augmented generation via sparse context selection. arXiv preprint arXiv:2405.16178, 2024."},{"key":"e_1_3_2_1_9_1","volume-title":"Piperag: Fast retrieval-augmented generation via algorithm-system co-design. arXiv preprint arXiv:2403.05676","author":"Jiang Wenqi","year":"2024","unstructured":"Wenqi Jiang, Shuai Zhang, Boran Han, Jie Wang, Bernie Wang, and Tim Kraska. Piperag: Fast retrieval-augmented generation via algorithm-system co-design. arXiv preprint arXiv:2403.05676, 2024."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578519"},{"key":"e_1_3_2_1_11_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300, 2020."},{"key":"e_1_3_2_1_12_1","volume-title":"Findings of the Association for Computational Linguistics ACL 2024, pages 6233--6251, Bangkok, Thailand and virtual meeting","author":"Xiong Guangzhi","year":"2024","unstructured":"Guangzhi Xiong, Qiao Jin, Zhiyong Lu, and Aidong Zhang. Benchmarking retrieval-augmented generation for medicine. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, Findings of the Association for Computational Linguistics ACL 2024, pages 6233--6251, Bangkok, Thailand and virtual meeting, August 2024. Association for Computational Linguistics."},{"key":"e_1_3_2_1_13_1","first-page":"387","volume-title":"International Conference on Artificial Intelligence in Education Technology","author":"Dayarathne Ranul","year":"2024","unstructured":"Ranul Dayarathne, Uvini Ranaweera, and Upeksha Ganegoda. Comparing the performance of llms in rag-based question-answering: A case study in computer science literature. In International Conference on Artificial Intelligence in Education Technology, pages 387--403. Springer, 2024."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-024-00864-x"},{"key":"e_1_3_2_1_15_1","first-page":"2206","volume-title":"International conference on machine learning","author":"Borgeaud Sebastian","year":"2022","unstructured":"Sebastian Borgeaud, Arthur Mensch, Jordan Hoffmann, Trevor Cai, Eliza Rutherford, Katie Millican, George Bm Van Den Driessche, Jean-Baptiste Lespiau, Bogdan Damoc, Aidan Clark, et al. Improving language models by retrieving from trillions of tokens. In International conference on machine learning, pages 2206--2240. PMLR, 2022."},{"key":"e_1_3_2_1_16_1","first-page":"5199","article-title":"Highly-efficient billion-scale approximate nearest neighborhood search","volume":"34","author":"Chen Qi","year":"2021","unstructured":"Qi Chen, Bing Zhao, Haidong Wang, Mingqin Li, Chuanjie Liu, Zengzhong Li, Mao Yang, and Jingdong Wang. Spann: Highly-efficient billion-scale approximate nearest neighborhood search. Advances in Neural Information Processing Systems, 34:5199--5212, 2021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","volume-title":"Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs","author":"Malkov Yu A","year":"2018","unstructured":"Yu A Malkov and Dmitry A Yashunin. Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE transactions on pattern analysis and machine intelligence, 42(4):824--836, 2018."},{"key":"e_1_3_2_1_18_1","volume-title":"Product quantization for nearest neighbor search","author":"Jegou Herve","year":"2010","unstructured":"Herve Jegou, Matthijs Douze, and Cordelia Schmid. Product quantization for nearest neighbor search. IEEE transactions on pattern analysis and machine intelligence, 33(1):117--128, 2010."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","first-page":"681","DOI":"10.1162\/tacl_a_00667","article-title":"Evaluating Correctness and Faithfulness of Instruction-Following Models for Question Answering","volume":"12","author":"Adlakha Vaibhav","year":"2024","unstructured":"Vaibhav Adlakha, Parishad BehnamGhader, Xing Han Lu, Nicholas Meade, and Siva Reddy. Evaluating Correctness and Faithfulness of Instruction-Following Models for Question Answering. Transactions of the Association for Computational Linguistics, 12:681--699, 05 2024.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"e_1_3_2_1_20_1","volume-title":"The faiss library. arXiv preprint arXiv:2401.08281","author":"Douze Matthijs","year":"2024","unstructured":"Matthijs Douze, Alexandr Guzhva, Chengqi Deng, Jeff Johnson, Gergely Szilvasy, Pierre-Emmanuel Mazar\u00e9, Maria Lomeli, Lucas Hosseini, and Herv\u00e9 J\u00e9gou. The faiss library. arXiv preprint arXiv:2401.08281, 2024."},{"key":"e_1_3_2_1_21_1","volume-title":"The llama 3 herd of models. arXiv preprint arXiv:2407.21783","author":"Dubey Abhimanyu","year":"2024","unstructured":"Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. The llama 3 herd of models. arXiv preprint arXiv:2407.21783, 2024."},{"key":"e_1_3_2_1_22_1","first-page":"32","article-title":"Fast accurate billion-point nearest neighbor search on a single node","author":"Subramanya Suhas Jayaram","year":"2019","unstructured":"Suhas Jayaram Subramanya, Fnu Devvrit, Harsha Vardhan Simhadri, Ravishankar Krishnawamy, and Rohan Kadekodi. Diskann: Fast accurate billion-point nearest neighbor search on a single node. Advances in Neural Information Processing Systems, 32, 2019.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_23_1","volume-title":"Ragserve: Fast quality-aware rag systems with configuration adaptation. arXiv preprint arXiv:2412.10543","author":"Ray Siddhant","year":"2024","unstructured":"Siddhant Ray, Rui Pan, Zhuohan Gu, Kuntai Du, Ganesh Anantha-narayanan, Ravi Netravali, and Junchen Jiang. Ragserve: Fast quality-aware rag systems with configuration adaptation. arXiv preprint arXiv:2412.10543, 2024."},{"key":"e_1_3_2_1_24_1","volume-title":"Ragcache: Efficient knowledge caching for retrieval-augmented generation. arXiv preprint arXiv:2404.12457","author":"Jin Chao","year":"2024","unstructured":"Chao Jin, Zili Zhang, Xuanlin Jiang, Fangyue Liu, Xin Liu, Xuanzhe Liu, and Xin Jin. Ragcache: Efficient knowledge caching for retrieval-augmented generation. arXiv preprint arXiv:2404.12457, 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"Turborag: Accelerating retrieval-augmented generation with precomputed kv caches for chunked text. arXiv preprint arXiv:2410.07590","author":"Lu Songshuo","year":"2024","unstructured":"Songshuo Lu, Hua Wang, Yutian Rong, Zhi Chen, and Yaohua Tang. Turborag: Accelerating retrieval-augmented generation with precomputed kv caches for chunked text. arXiv preprint arXiv:2410.07590, 2024."},{"key":"e_1_3_2_1_26_1","volume-title":"Don't do rag: When cache-augmented generation is all you need for knowledge tasks. arXiv preprint arXiv:2412.15605","author":"Chan Brian J","year":"2024","unstructured":"Brian J Chan, Chao-Ting Chen, Jui-Hung Cheng, and Hen-Hsen Huang. Don't do rag: When cache-augmented generation is all you need for knowledge tasks. arXiv preprint arXiv:2412.15605, 2024."},{"key":"e_1_3_2_1_27_1","unstructured":"Zilong Wang Zifeng Wang Long Le Huaixiu Steven Zheng Swaroop Mishra Vincent Perot Yuwei Zhang Anush Mattapalli Ankur Taly Jingbo Shang et al. Speculative rag: Enhancing retrieval augmented generation through drafting. arXiv preprint arXiv:2407.08223 2024."},{"key":"e_1_3_2_1_28_1","volume-title":"Data caching issues in an information retrieval system. ACM Transactions on Database Systems (TODS), 15(3):359--384","author":"Alonso Rafael","year":"1990","unstructured":"Rafael Alonso, Daniel Barbara, and Hector Garcia-Molina. Data caching issues in an information retrieval system. ACM Transactions on Database Systems (TODS), 15(3):359--384, 1990."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/1559795.1559815"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/1526709.1526769"},{"key":"e_1_3_2_1_31_1","volume-title":"Similarity caching in large-scale image retrieval. Information processing & management, 48(5):803--818","author":"Falchi Fabrizio","year":"2012","unstructured":"Fabrizio Falchi, Claudio Lucchese, Salvatore Orlando, Raffaele Perego, and Fausto Rabitti. Similarity caching in large-scale image retrieval. Information processing & management, 48(5):803--818, 2012."},{"key":"e_1_3_2_1_32_1","volume-title":"Analysis of similarity caching on general cache networks","author":"Nakamura Ryo","year":"2024","unstructured":"Ryo Nakamura and Noriaki Kamiyama. Analysis of similarity caching on general cache networks. IEEE Access, 2024."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2018.2844983"},{"key":"e_1_3_2_1_34_1","volume-title":"Gpt semantic cache: Reducing llm costs and latency via semantic embedding caching. arXiv preprint arXiv:2411.05276","author":"Regmi Sajal","year":"2024","unstructured":"Sajal Regmi and Chetan Phakami Pun. Gpt semantic cache: Reducing llm costs and latency via semantic embedding caching. arXiv preprint arXiv:2411.05276, 2024."}],"event":{"name":"EuroMLSys '25: 5th Workshop on Machine Learning and Systems","location":"World Trade Center Rotterdam Netherlands","acronym":"EuroMLSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 5th Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721146.3721941","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721146.3721941","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:57:39Z","timestamp":1750298259000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721146.3721941"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":34,"alternative-id":["10.1145\/3721146.3721941","10.1145\/3721146"],"URL":"https:\/\/doi.org\/10.1145\/3721146.3721941","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-04-01","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}