{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T16:05:24Z","timestamp":1774281924831,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":77,"publisher":"ACM","funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2124039"],"award-info":[{"award-number":["2124039"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764806","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"623-638","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["HedraRAG: Co-Optimizing Generation and Retrieval for Heterogeneous RAG Workflows"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-8500-6173","authenticated-orcid":false,"given":"Zhengding","family":"Hu","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, University of California San Diego, La Jolla, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3288-6858","authenticated-orcid":false,"given":"Vibha","family":"Murthy","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of California San Diego, La Jolla, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6759-2616","authenticated-orcid":false,"given":"Zaifeng","family":"Pan","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of California San Diego, La Jolla, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0098-0670","authenticated-orcid":false,"given":"Wanlu","family":"Li","sequence":"additional","affiliation":[{"name":"Nano and Chemical Engineering, University of California San Diego, La Jolla, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0660-0003","authenticated-orcid":false,"given":"Xiaoyi","family":"Fang","sequence":"additional","affiliation":[{"name":"RegAilator Inc, Hudson, Ohio, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8716-5793","authenticated-orcid":false,"given":"Yufei","family":"Ding","sequence":"additional","affiliation":[{"name":"University of California San Diego, La Jolla, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1634-8549","authenticated-orcid":false,"given":"Yuke","family":"Wang","sequence":"additional","affiliation":[{"name":"Computer Science, Rice University, Houston, Texas, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2019. Haystack. https:\/\/github.com\/deepset-ai\/haystack."},{"key":"e_1_3_2_1_2_1","unstructured":"2022. langChain. https:\/\/github.com\/langchain-ai\/langchain."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","first-page":"1295","DOI":"10.3390\/electronics9081295","article-title":"The k-means algorithm: A comprehensive survey and performance evaluation","volume":"9","author":"Ahmed Mohiuddin","year":"2020","unstructured":"Mohiuddin Ahmed, Raihan Seraj, and Syed Mohammed Shamsul Islam. 2020. The k-means algorithm: A comprehensive survey and performance evaluation. Electronics 9, 8 (2020), 1295.","journal-title":"Electronics"},{"key":"e_1_3_2_1_4_1","volume-title":"Kyle Lo, Luca Soldaini, Sergey Feldman, Mike D'arcy, et al.","author":"Asai Akari","year":"2024","unstructured":"Akari Asai, Jacqueline He, Rulin Shao, Weijia Shi, Amanpreet Singh, Joseph Chee Chang, Kyle Lo, Luca Soldaini, Sergey Feldman, Mike D'arcy, et al. 2024. Openscholar: Synthesizing scientific literature with retrieval-augmented lms. arXiv preprint arXiv:2411.14199 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511","author":"Asai Akari","year":"2023","unstructured":"Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"International conference on machine learning. PMLR, 2206\u20132240","author":"Borgeaud Sebastian","year":"2022","unstructured":"Sebastian Borgeaud, Arthur Mensch, Jordan Hoffmann, Trevor Cai, Eliza Rutherford, Katie Millican, George Bm Van Den Driessche, Jean-Baptiste Lespiau, Bogdan Damoc, Aidan Clark, et al. 2022. Improving language models by retrieving from trillions of tokens. In International conference on machine learning. PMLR, 2206\u20132240."},{"key":"e_1_3_2_1_7_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_2_1_8_1","volume-title":"Retrieval is accurate generation. arXiv preprint arXiv:2402.17532","author":"Cao Bowen","year":"2024","unstructured":"Bowen Cao, Deng Cai, Leyang Cui, Xuxin Cheng, Wei Bi, Yuexian Zou, and Shuming Shi. 2024. Retrieval is accurate generation. arXiv preprint arXiv:2402.17532 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Reading wikipedia to answer open-domain questions. arXiv preprint arXiv:1704.00051","author":"Chen Danqi","year":"2017","unstructured":"Danqi Chen, Adam Fisch, Jason Weston, and Antoine Bordes. 2017. Reading wikipedia to answer open-domain questions. arXiv preprint arXiv:1704.00051 (2017)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Leonardo Dagum and Ramesh Menon. 1998. OpenMP: an industry standard API for shared-memory programming. IEEE computational science and engineering 5 1 (1998) 46\u201355.","DOI":"10.1109\/99.660313"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation. 33\u201348","author":"Ding Yufei","year":"2017","unstructured":"Yufei Ding, Lin Ning, Hui Guan, and Xipeng Shen. 2017. Generalizations of the theory and deployment of triangular inequality for compiler-based strength reduction. In Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation. 33\u201348."},{"key":"e_1_3_2_1_13_1","volume-title":"Top: A framework for enabling algorithmic optimizations for distance-related problems. In PVLDB. 1046\u20131057.","author":"Ding Yufei","year":"2015","unstructured":"Yufei Ding, Xipeng Shen, Madanlal Musuvathi, Todd Mytkowicz, and Madan Musuvathi. 2015. Top: A framework for enabling algorithmic optimizations for distance-related problems. In PVLDB. 1046\u20131057."},{"key":"e_1_3_2_1_14_1","unstructured":"Matthijs Douze Alexandr Guzhva Chengqi Deng Jeff Johnson Gergely Szilvasy Pierre-Emmanuel Mazar\u00e9 Maria Lomeli Lucas Hosseini and Herv\u00e9 J\u00e9gou. 2024. The Faiss library. (2024). arXiv:2401.08281 [cs.LG]"},{"key":"e_1_3_2_1_15_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Precise zero-shot dense retrieval without relevance labels. arXiv preprint arXiv:2212.10496","author":"Gao Luyu","year":"2022","unstructured":"Luyu Gao, Xueguang Ma, Jimmy Lin, and Jamie Callan. 2022. Precise zero-shot dense retrieval without relevance labels. arXiv preprint arXiv:2212.10496 (2022)."},{"key":"e_1_3_2_1_17_1","first-page":"325","article-title":"Prompt cache: Modular attention reuse for low-latency inference","volume":"6","author":"Gim In","year":"2024","unstructured":"In Gim, Guojun Chen, Seung-seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2024. Prompt cache: Modular attention reuse for low-latency inference. Proceedings of Machine Learning and Systems 6 (2024), 325\u2013338.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_18_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_19_1","volume-title":"Cognify: Supercharging Gen-AI Workflows With Hierarchical Autotuning. arXiv preprint arXiv:2502.08056","author":"He Zijian","year":"2025","unstructured":"Zijian He, Reyna Abhyankar, Vikranth Srivatsa, and Yiying Zhang. 2025. Cognify: Supercharging Gen-AI Workflows With Hierarchical Autotuning. arXiv preprint arXiv:2502.08056 (2025)."},{"key":"e_1_3_2_1_20_1","volume-title":"Saku Sugawara, and Akiko Aizawa.","author":"Ho Xanh","year":"2020","unstructured":"Xanh Ho, Anh-Khoa Duong Nguyen, Saku Sugawara, and Akiko Aizawa. 2020. Constructing a multi-hop qa dataset for comprehensive evaluation of reasoning steps. arXiv preprint arXiv:2011.01060 (2020)."},{"key":"e_1_3_2_1_21_1","volume-title":"Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al.","author":"Hoffmann Jordan","year":"2022","unstructured":"Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. 2022. Training compute-optimal large language models. arXiv preprint arXiv:2203.15556 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Zijuan Lin, Liyang Zhou, et al.","author":"Hong Sirui","year":"2023","unstructured":"Sirui Hong, Xiawu Zheng, Jonathan Chen, Yuheng Cheng, Jinlin Wang, Ceyao Zhang, Zili Wang, Steven Ka Shing Yau, Zijuan Lin, Liyang Zhou, et al. 2023. Metagpt: Meta programming for multi-agent collaborative framework. arXiv preprint arXiv:2308.00352 3, 4 (2023), 6."},{"key":"e_1_3_2_1_23_1","first-page":"1","article-title":"Atlas: Few-shot learning with retrieval augmented language models","volume":"24","author":"Izacard Gautier","year":"2023","unstructured":"Gautier Izacard, Patrick Lewis, Maria Lomeli, Lucas Hosseini, Fabio Petroni, Timo Schick, Jane Dwivedi-Yu, Armand Joulin, Sebastian Riedel, and Edouard Grave. 2023. Atlas: Few-shot learning with retrieval augmented language models. Journal of Machine Learning Research 24, 251 (2023), 1\u201343.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_24_1","volume-title":"Sung Ju Hwang, and Jong C Park","author":"Jeong Soyeong","year":"2024","unstructured":"Soyeong Jeong, Jinheon Baek, Sukmin Cho, Sung Ju Hwang, and Jong C Park. 2024. Adaptive-rag: Learning to adapt retrieval-augmented large language models through question complexity. arXiv preprint arXiv:2403.14403 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Longllmlingua: Accelerating and enhancing llms in long context scenarios via prompt compression. arXiv preprint arXiv:2310.06839","author":"Jiang Huiqiang","year":"2023","unstructured":"Huiqiang Jiang, Qianhui Wu, Xufang Luo, Dongsheng Li, Chin-Yew Lin, Yuqing Yang, and Lili Qiu. 2023. Longllmlingua: Accelerating and enhancing llms in long context scenarios via prompt compression. arXiv preprint arXiv:2310.06839 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"RAGO: Systematic Performance Optimization for Retrieval-Augmented Generation Serving. arXiv preprint arXiv:2503.14649","author":"Jiang Wenqi","year":"2025","unstructured":"Wenqi Jiang, Suvinay Subramanian, Cat Graves, Gustavo Alonso, Amir Yazdanbakhsh, and Vidushi Dadu. 2025. RAGO: Systematic Performance Optimization for Retrieval-Augmented Generation Serving. arXiv preprint arXiv:2503.14649 (2025)."},{"key":"e_1_3_2_1_27_1","volume-title":"Chameleon: a heterogeneous and disaggregated accelerator system for retrieval-augmented language models. arXiv preprint arXiv:2310.09949","author":"Jiang Wenqi","year":"2023","unstructured":"Wenqi Jiang, Marco Zeller, Roger Waleffe, Torsten Hoefler, and Gustavo Alonso. 2023. Chameleon: a heterogeneous and disaggregated accelerator system for retrieval-augmented language models. arXiv preprint arXiv:2310.09949 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Piperag: Fast retrieval-augmented generation via algorithm-system co-design. arXiv preprint arXiv:2403.05676","author":"Jiang Wenqi","year":"2024","unstructured":"Wenqi Jiang, Shuai Zhang, Boran Han, Jie Wang, Bernie Wang, and Tim Kraska. 2024. Piperag: Fast retrieval-augmented generation via algorithm-system co-design. arXiv preprint arXiv:2403.05676 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Active retrieval augmented generation. arXiv preprint arXiv:2305.06983","author":"Jiang Zhengbao","year":"2023","unstructured":"Zhengbao Jiang, Frank F Xu, Luyu Gao, Zhiqing Sun, Qian Liu, Jane Dwivedi-Yu, Yiming Yang, Jamie Callan, and Graham Neubig. 2023. Active retrieval augmented generation. arXiv preprint arXiv:2305.06983 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"RAGCache: Efficient Knowledge Caching for Retrieval-Augmented Generation. arXiv preprint arXiv:2404.12457","author":"Jin Chao","year":"2024","unstructured":"Chao Jin, Zili Zhang, Xuanlin Jiang, Fangyue Liu, Xin Liu, Xuanzhe Liu, and Xin Jin. 2024. RAGCache: Efficient Knowledge Caching for Retrieval-Augmented Generation. arXiv preprint arXiv:2404.12457 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"FlashRAG: A Modular Toolkit for Efficient Retrieval-Augmented Generation Research. arXiv preprint arXiv:2405.13576","author":"Jin Jiajie","year":"2024","unstructured":"Jiajie Jin, Yutao Zhu, Xinyu Yang, Chenghao Zhang, and Zhicheng Dou. 2024. FlashRAG: A Modular Toolkit for Efficient Retrieval-Augmented Generation Research. arXiv preprint arXiv:2405.13576 (2024)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","article-title":"Billion-scale similarity search with GPUs","volume":"7","author":"Johnson Jeff","year":"2019","unstructured":"Jeff Johnson, Matthijs Douze, and Herv\u00e9 J\u00e9gou. 2019. Billion-scale similarity search with GPUs. IEEE Transactions on Big Data 7, 3 (2019), 535\u2013547.","journal-title":"IEEE Transactions on Big Data"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","article-title":"Billion-scale similarity search with GPUs","volume":"7","author":"Johnson Jeff","year":"2019","unstructured":"Jeff Johnson, Matthijs Douze, and Herv\u00e9 J\u00e9gou. 2019. Billion-scale similarity search with GPUs. IEEE Transactions on Big Data 7, 3 (2019), 535\u2013547.","journal-title":"IEEE Transactions on Big Data"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","article-title":"Billion-scale similarity search with GPUs","volume":"7","author":"Johnson Jeff","year":"2019","unstructured":"Jeff Johnson, Matthijs Douze, and Herv\u00e9 J\u00e9gou. 2019. Billion-scale similarity search with GPUs. IEEE Transactions on Big Data 7, 3 (2019), 535\u2013547.","journal-title":"IEEE Transactions on Big Data"},{"key":"e_1_3_2_1_35_1","volume-title":"Charles LA Clarke, and Davood Rafiei","author":"Kamalloo Ehsan","year":"2023","unstructured":"Ehsan Kamalloo, Nouha Dziri, Charles LA Clarke, and Davood Rafiei. 2023. Evaluating open-domain question answering in the era of large language models. arXiv preprint arXiv:2305.06984 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"International Conference on Machine Learning. PMLR, 15696\u201315707","author":"Kandpal Nikhil","year":"2023","unstructured":"Nikhil Kandpal, Haikang Deng, Adam Roberts, Eric Wallace, and Colin Raffel. 2023. Large language models struggle to learn long-tail knowledge. In International Conference on Machine Learning. PMLR, 15696\u201315707."},{"key":"e_1_3_2_1_37_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)."},{"key":"e_1_3_2_1_38_1","volume-title":"Tree of clarifications: Answering ambiguous questions with retrieval-augmented large language models. arXiv preprint arXiv:2310.14696","author":"Kim Gangwoo","year":"2023","unstructured":"Gangwoo Kim, Sungdong Kim, Byeongguk Jeon, Joonsuk Park, and Jaewoo Kang. 2023. Tree of clarifications: Answering ambiguous questions with retrieval-augmented large language models. arXiv preprint arXiv:2310.14696 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"5th online world conference on soft computing in industrial applications (WSC5)","author":"K\u00f6ppen Mario","unstructured":"Mario K\u00f6ppen. 2000. The curse of dimensionality. In 5th online world conference on soft computing in industrial applications (WSC5), Vol. 1. 4\u20138."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","first-page":"453","DOI":"10.1162\/tacl_a_00276","article-title":"Natural questions: a benchmark for question answering research","volume":"7","author":"Kwiatkowski Tom","year":"2019","unstructured":"Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Illia Polosukhin, Jacob Devlin, Kenton Lee, et al. 2019. Natural questions: a benchmark for question answering research. Transactions of the Association for Computational Linguistics 7 (2019), 453\u2013466.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles. 611\u2013626","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient memory management for large language model serving with pagedattention. In Proceedings of the 29th Symposium on Operating Systems Principles. 611\u2013626."},{"key":"e_1_3_2_1_42_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020), 9459\u20139474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","unstructured":"Chien-Yu Lin Keisuke Kamahori Yiyu Liu Xiaoxiang Shi Madhav Kashyap Yile Gu Rulin Shao Zihao Ye Kan Zhu Stephanie Wang et al. 2025. TeleRAG: Efficient Retrieval-Augmented Generation Inference with Lookahead Retrieval. arXiv preprint arXiv:2502.20969 (2025)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","unstructured":"Jerry Liu. 2022. LlamaIndex. https:\/\/github.com\/jerryjliu\/llama_index. 10.5281\/zenodo.1234","DOI":"10.5281\/zenodo.1234"},{"key":"e_1_3_2_1_45_1","volume-title":"Superposition Prompting: Improving and Accelerating Retrieval-Augmented Generation. arXiv preprint arXiv:2404.06910","author":"Merth Thomas","year":"2024","unstructured":"Thomas Merth, Qichen Fu, Mohammad Rastegari, and Mahyar Najibi. 2024. Superposition Prompting: Improving and Accelerating Retrieval-Augmented Generation. arXiv preprint arXiv:2404.06910 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the 2018 Conference of the ACM Special Interest Group on Data Communication. 327\u2013341","author":"Neugebauer Rolf","year":"2018","unstructured":"Rolf Neugebauer, Gianni Antichi, Jos\u00e9 Fernando Zazo, Yury Audzevich, Sergio L\u00f3pez-Buedo, and Andrew W Moore. 2018. Understanding PCIe performance for end host networking. In Proceedings of the 2018 Conference of the ACM Special Interest Group on Data Communication. 327\u2013341."},{"key":"e_1_3_2_1_47_1","unstructured":"OpenAI. 2024. Learning to Reason with Language Models. https:\/\/openai.com\/index\/learning-to-reason-with-llms\/"},{"key":"e_1_3_2_1_48_1","volume-title":"Companion Proceedings of the ACM on Web Conference","author":"Peng Wenjun","year":"2024","unstructured":"Wenjun Peng, Guiyang Li, Yue Jiang, Zilong Wang, Dan Ou, Xiaoyi Zeng, Derong Xu, Tong Xu, and Enhong Chen. 2024. Large language model based long-tail query rewriting in taobao search. In Companion Proceedings of the ACM on Web Conference 2024. 20\u201328."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","first-page":"1316","DOI":"10.1162\/tacl_a_00605","article-title":"In-context retrieval-augmented language models","volume":"11","author":"Ram Ori","year":"2023","unstructured":"Ori Ram, Yoav Levine, Itay Dalmedigos, Dor Muhlgay, Amnon Shashua, Kevin Leyton-Brown, and Yoav Shoham. 2023. In-context retrieval-augmented language models. Transactions of the Association for Computational Linguistics 11 (2023), 1316\u20131331.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"e_1_3_2_1_50_1","volume-title":"RAGServe: Fast Quality-Aware RAG Systems with Configuration Adaptation. arXiv preprint arXiv:2412.10543","author":"Ray Siddhant","year":"2024","unstructured":"Siddhant Ray, Rui Pan, Zhuohan Gu, Kuntai Du, Ganesh Ananthanarayanan, Ravi Netravali, and Junchen Jiang. 2024. RAGServe: Fast Quality-Aware RAG Systems with Configuration Adaptation. arXiv preprint arXiv:2412.10543 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"Enhancing retrieval-augmented large language models with iterative retrieval-generation synergy. arXiv preprint arXiv:2305.15294","author":"Shao Zhihong","year":"2023","unstructured":"Zhihong Shao, Yeyun Gong, Yelong Shen, Minlie Huang, Nan Duan, and Weizhu Chen. 2023. Enhancing retrieval-augmented large language models with iterative retrieval-generation synergy. arXiv preprint arXiv:2305.15294 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 2998\u20133009","author":"Song Chan Hee","year":"2023","unstructured":"Chan Hee Song, Jiaman Wu, Clayton Washington, Brian M Sadler, Wei-Lun Chao, and Yu Su. 2023. Llm-planner: Few-shot grounded planning for embodied agents with large language models. In Proceedings of the IEEE\/CVF international conference on computer vision. 2998\u20133009."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Tan Xin","year":"2025","unstructured":"Xin Tan, Yimin Jiang, Yitao Yang, and Hong Xu. 2025. Towards End-to-End Optimization of LLM-based Applications with Ayo. In Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2. 1302\u20131316."},{"key":"e_1_3_2_1_54_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_55_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of the 2021 International Conference on Management of Data. 2614\u20132627","author":"Wang Jianguo","year":"2021","unstructured":"Jianguo Wang, Xiaomeng Yi, Rentong Guo, Hai Jin, Peng Xu, Shengjun Li, Xiangyu Wang, Xiangzhou Guo, Chengming Li, Xiaohai Xu, et al. 2021. Milvus: A purpose-built vector data management system. In Proceedings of the 2021 International Conference on Management of Data. 2614\u20132627."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","first-page":"186345","DOI":"10.1007\/s11704-024-40231-1","article-title":"A survey on large language model based autonomous agents","volume":"18","author":"Wang Lei","year":"2024","unstructured":"Lei Wang, Chen Ma, Xueyang Feng, Zeyu Zhang, Hao Yang, Jingsen Zhang, Zhiyuan Chen, Jiakai Tang, Xu Chen, Yankai Lin, et al. 2024. A survey on large language model based autonomous agents. Frontiers of Computer Science 18, 6 (2024), 186345.","journal-title":"Frontiers of Computer Science"},{"key":"e_1_3_2_1_58_1","volume-title":"Text embeddings by weakly-supervised contrastive pre-training. arXiv preprint arXiv:2212.03533","author":"Wang Liang","year":"2022","unstructured":"Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, and Furu Wei. 2022. Text embeddings by weakly-supervised contrastive pre-training. arXiv preprint arXiv:2212.03533 (2022)."},{"key":"e_1_3_2_1_59_1","volume-title":"Multilingual e5 text embeddings: A technical report. arXiv preprint arXiv:2402.05672","author":"Wang Liang","year":"2024","unstructured":"Liang Wang, Nan Yang, Xiaolong Huang, Linjun Yang, Rangan Majumder, and Furu Wei. 2024. Multilingual e5 text embeddings: A technical report. arXiv preprint arXiv:2402.05672 (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022), 24824\u201324837."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","first-page":"121101","DOI":"10.1007\/s11432-024-4222-0","article-title":"The rise and potential of large language model based agents: A survey","volume":"68","author":"Xi Zhiheng","year":"2025","unstructured":"Zhiheng Xi, Wenxiang Chen, Xin Guo, Wei He, Yiwen Ding, Boyang Hong, Ming Zhang, Junzhe Wang, Senjie Jin, Enyu Zhou, et al. 2025. The rise and potential of large language model based agents: A survey. Science China Information Sciences 68, 2 (2025), 121101.","journal-title":"Science China Information Sciences"},{"key":"e_1_3_2_1_62_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Xu Fangyuan","year":"2024","unstructured":"Fangyuan Xu, Weijia Shi, and Eunsol Choi. 2024. RECOMP: Improving retrieval-augmented LMs with context compression and selective augmentation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_63_1","first-page":"1","article-title":"Tribase: A Vector Data Query Engine for Reliable and Lossless Pruning Compression using Triangle Inequalities","volume":"3","author":"Xu Qian","year":"2025","unstructured":"Qian Xu, Juan Yang, Feng Zhang, Junda Pan, Kang Chen, Youren Shen, Amelie Chi Zhou, and Xiaoyong Du. 2025. Tribase: A Vector Data Query Engine for Reliable and Lossless Pruning Compression using Triangle Inequalities. Proceedings of the ACM on Management of Data 3, 1 (2025), 1\u201328.","journal-title":"Proceedings of the ACM on Management of Data"},{"key":"e_1_3_2_1_64_1","volume-title":"Corrective retrieval augmented generation. arXiv preprint arXiv:2401.15884","author":"Yan Shi-Qi","year":"2024","unstructured":"Shi-Qi Yan, Jia-Chen Gu, Yun Zhu, and Zhen-Hua Ling. 2024. Corrective retrieval augmented generation. arXiv preprint arXiv:2401.15884 (2024)."},{"key":"e_1_3_2_1_65_1","volume-title":"HotpotQA: A dataset for diverse, explainable multi-hop question answering. arXiv preprint arXiv:1809.09600","author":"Yang Zhilin","year":"2018","unstructured":"Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning. 2018. HotpotQA: A dataset for diverse, explainable multi-hop question answering. arXiv preprint arXiv:1809.09600 (2018)."},{"key":"e_1_3_2_1_66_1","volume-title":"Proceedings of the Twentieth European Conference on Computer Systems. 94\u2013109","author":"Yao Jiayi","year":"2025","unstructured":"Jiayi Yao, Hanchen Li, Yuhan Liu, Siddhant Ray, Yihua Cheng, Qizheng Zhang, Kuntai Du, Shan Lu, and Junchen Jiang. 2025. CacheBlend: Fast Large Language Model Serving for RAG with Cached Knowledge Fusion. In Proceedings of the Twentieth European Conference on Computer Systems. 94\u2013109."},{"key":"e_1_3_2_1_67_1","volume-title":"A survey on large language model (llm) security and privacy: The good, the bad, and the ugly. High-Confidence Computing","author":"Yao Yifan","year":"2024","unstructured":"Yifan Yao, Jinhao Duan, Kaidi Xu, Yuanfang Cai, Zhibo Sun, and Yue Zhang. 2024. A survey on large language model (llm) security and privacy: The good, the bad, and the ugly. High-Confidence Computing (2024), 100211."},{"key":"e_1_3_2_1_68_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_2_1_69_1","volume-title":"Rankrag: Unifying context ranking with retrieval-augmented generation in llms. arXiv preprint arXiv:2407.02485","author":"Yu Yue","year":"2024","unstructured":"Yue Yu, Wei Ping, Zihan Liu, Boxin Wang, Jiaxuan You, Chao Zhang, Mohammad Shoeybi, and Bryan Catanzaro. 2024. Rankrag: Unifying context ranking with retrieval-augmented generation in llms. arXiv preprint arXiv:2407.02485 (2024)."},{"key":"e_1_3_2_1_70_1","volume-title":"Inference scaling for long-context retrieval augmented generation. arXiv preprint arXiv:2410.04343","author":"Yue Zhenrui","year":"2024","unstructured":"Zhenrui Yue, Honglei Zhuang, Aijun Bai, Kai Hui, Rolf Jagerman, Hansi Zeng, Zhen Qin, Dong Wang, Xuanhui Wang, and Michael Bendersky. 2024. Inference scaling for long-context retrieval augmented generation. arXiv preprint arXiv:2410.04343 (2024)."},{"key":"e_1_3_2_1_71_1","volume-title":"Honeycomb: A flexible llm-based agent system for materials science. arXiv preprint arXiv:2409.00135","author":"Zhang Huan","year":"2024","unstructured":"Huan Zhang, Yu Song, Ziyu Hou, Santiago Miret, and Bang Liu. 2024. Honeycomb: A flexible llm-based agent system for materials science. arXiv preprint arXiv:2409.00135 (2024)."},{"key":"e_1_3_2_1_72_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_73_1","unstructured":"Yue Zhang Yafu Li Leyang Cui Deng Cai Lemao Liu Tingchen Fu Xinting Huang Enbo Zhao Yu Zhang Yulong Chen et al. 2023. Siren's song in the AI ocean: a survey on hallucination in large language models. arXiv preprint arXiv:2309.01219 (2023)."},{"key":"e_1_3_2_1_74_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Zhang Zili","year":"2024","unstructured":"Zili Zhang, Fangyue Liu, Gang Huang, Xuanzhe Liu, and Xin Jin. 2024. Fast Vector Query Processing for Large Datasets Beyond {GPU} Memory with Reordered Pipelining. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 23\u201340."},{"key":"e_1_3_2_1_75_1","volume-title":"Phitchaya Mangpo Phothilimthana, and Zhihao Jia","author":"Zhang Zhihao","year":"2024","unstructured":"Zhihao Zhang, Alan Zhu, Lijie Yang, Yihua Xu, Lanting Li, Phitchaya Mangpo Phothilimthana, and Zhihao Jia. 2024. Accelerating retrieval-augmented language model serving with speculation. arXiv preprint arXiv:2401.14021 (2024)."},{"key":"e_1_3_2_1_76_1","volume-title":"Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL).","author":"Zhuang Honglei","year":"2024","unstructured":"Honglei Zhuang, Zhen Qin, Kai Hui, Junru Wu, Le Yan, Xuanhui Wang, and Michael Bendersky. 2024. Beyond Yes and No: Improving Zero-Shot Pointwise LLM Rankers via Scoring Fine-Grained Relevance Labels. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)."},{"key":"e_1_3_2_1_77_1","volume-title":"Inverted files for text search engines. ACM computing surveys (CSUR) 38, 2","author":"Zobel Justin","year":"2006","unstructured":"Justin Zobel and Alistair Moffat. 2006. Inverted files for text search engines. ACM computing surveys (CSUR) 38, 2 (2006), 6\u2013es."}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:48:46Z","timestamp":1759322926000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764806"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":77,"alternative-id":["10.1145\/3731569.3764806","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764806","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}