{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:10:24Z","timestamp":1780708224420,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Research Grants Council of Hong Kong","award":["C7004-22G"],"award-info":[{"award-number":["C7004-22G"]}]},{"name":"CUHK","award":["4937007,4937008,5501329,5501517"],"award-info":[{"award-number":["4937007,4937008,5501329,5501517"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3676641.3716278","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:47:32Z","timestamp":1743094052000},"page":"1302-1316","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Towards End-to-End Optimization of LLM-based Applications with Ayo"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3785-9700","authenticated-orcid":false,"given":"Xin","family":"Tan","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0049-873X","authenticated-orcid":false,"given":"Yimin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Unaffiliated, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1332-7342","authenticated-orcid":false,"given":"Yitao","family":"Yang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9359-9571","authenticated-orcid":false,"given":"Hong","family":"Xu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/github.com\/jerryjliu\/llama_index","year":"2022","unstructured":"LlamaIndex. https:\/\/github.com\/jerryjliu\/llama_index, 2022."},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/https:\/\/github.com\/microsoft\/promptflow","year":"2023","unstructured":"Promptflow. https:\/\/https:\/\/github.com\/microsoft\/promptflow, 2023."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/github.com\/Significant-Gravitas\/AutoGPT","year":"2024","unstructured":"Autogpt. https:\/\/github.com\/Significant-Gravitas\/AutoGPT, 2024."},{"key":"e_1_3_2_1_4_1","volume-title":"https:\/\/www.bing.com\/chat","author":"Copilot Bing","year":"2024","unstructured":"Bing Copilot. https:\/\/www.bing.com\/chat, 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/character.ai\/","year":"2024","unstructured":"Character.ai\/. https:\/\/character.ai\/, 2024."},{"key":"e_1_3_2_1_6_1","volume-title":"https:\/\/www.anthropic.com\/news\/contextualretrieval","year":"2024","unstructured":"contextual-retrieval. https:\/\/www.anthropic.com\/news\/contextualretrieval, 2024."},{"key":"e_1_3_2_1_7_1","volume-title":"https:\/\/fastapi.tiangolo.com\/","year":"2024","unstructured":"Fastapi. https:\/\/fastapi.tiangolo.com\/, 2024."},{"key":"e_1_3_2_1_8_1","volume-title":"https:\/\/huggingface.co\/datasets\/lighthouzai\/ finqabench","author":"Dataset Finqabench","year":"2024","unstructured":"Finqabench Dataset. https:\/\/huggingface.co\/datasets\/lighthouzai\/ finqabench, 2024."},{"key":"e_1_3_2_1_9_1","volume-title":"https:\/\/programmablesearchengine.google.com\/","author":"Google","year":"2024","unstructured":"Google custom search. https:\/\/programmablesearchengine.google.com\/, 2024."},{"key":"e_1_3_2_1_10_1","volume-title":"https:\/\/github.com\/Azure\/GPT-RAG","year":"2024","unstructured":"Gpt-rag. https:\/\/github.com\/Azure\/GPT-RAG, 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"https:\/\/github.com\/deepset-ai\/haystack","year":"2024","unstructured":"haystack. https:\/\/github.com\/deepset-ai\/haystack, 2024."},{"key":"e_1_3_2_1_12_1","volume-title":"https:\/\/github.com\/langchain-ai\/langchain","year":"2024","unstructured":"Langchain. https:\/\/github.com\/langchain-ai\/langchain, 2024."},{"key":"e_1_3_2_1_13_1","volume-title":"https:\/\/python.langchain.com\/docs\/langgraph\/","year":"2024","unstructured":"LangGraph. https:\/\/python.langchain.com\/docs\/langgraph\/, 2024."},{"key":"e_1_3_2_1_14_1","volume-title":"https:\/\/github.com\/LazyAGI\/LazyLLM","year":"2024","unstructured":"Lazyllm. https:\/\/github.com\/LazyAGI\/LazyLLM, 2024."},{"key":"e_1_3_2_1_15_1","volume-title":"Exploration and practice from the perspective of trace. https:\/\/www.alibabacloud.com\/blog\/ observability-of-llm-applications-exploration-and-practice-fromthe- perspective-of-trace_601604","author":"Observability","year":"2024","unstructured":"Observability of llm applications: Exploration and practice from the perspective of trace. https:\/\/www.alibabacloud.com\/blog\/ observability-of-llm-applications-exploration-and-practice-fromthe- perspective-of-trace_601604, 2024."},{"key":"e_1_3_2_1_16_1","volume-title":"https:\/\/platform.openai.com\/docs\/guides\/function-calling","author":"Openai","year":"2024","unstructured":"Openai function calling. https:\/\/platform.openai.com\/docs\/guides\/function-calling, 2024."},{"key":"e_1_3_2_1_17_1","volume-title":"https:\/\/github.com\/aigc-apps\/PAI-RAG","year":"2024","unstructured":"Pairag. https:\/\/github.com\/aigc-apps\/PAI-RAG, 2024."},{"key":"e_1_3_2_1_18_1","volume-title":"https:\/\/www.perplexity.ai\/","author":"Perplexity","year":"2024","unstructured":"Perplexity ai. https:\/\/www.perplexity.ai\/, 2024."},{"key":"e_1_3_2_1_19_1","volume-title":"https:\/\/github.com\/pgvector\/pgvector","year":"2024","unstructured":"Pgvector. https:\/\/github.com\/pgvector\/pgvector, 2024."},{"key":"e_1_3_2_1_20_1","volume-title":"https:\/\/www.postgresql.org\/","year":"2024","unstructured":"Postgresql. https:\/\/www.postgresql.org\/, 2024."},{"key":"e_1_3_2_1_21_1","volume-title":"https:\/\/privatellm.app\/en","year":"2024","unstructured":"Privatellm. https:\/\/privatellm.app\/en, 2024."},{"key":"e_1_3_2_1_22_1","volume-title":"https:\/\/github.com\/triton-inference-server","author":"Triton","year":"2024","unstructured":"Triton inference server. https:\/\/github.com\/triton-inference-server, 2024."},{"key":"e_1_3_2_1_23_1","volume-title":"Proc. USENIX OSDI","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. Tensor- Flow: A system for Large-Scale machine learning. In Proc. USENIX OSDI, 2016."},{"key":"e_1_3_2_1_24_1","volume-title":"Taming throughput-latency tradeoff in llm inference with sarathi-serve. arXiv preprint arXiv:2403.02310","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, Alexey Tumanov, and Ramachandran Ramjee. Taming throughput-latency tradeoff in llm inference with sarathi-serve. arXiv preprint arXiv:2403.02310, 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, and Ramachandran Ramjee. Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369, 2023."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.nlposs-1.24"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.71"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1160"},{"key":"e_1_3_2_1_29_1","volume-title":"Proc. USENIX NSDI","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J Franklin, Joseph E Gonzalez, and Ion Stoica. Clipper: A {Low-Latency} online prediction serving system. In Proc. USENIX NSDI, 2017."},{"key":"e_1_3_2_1_30_1","volume-title":"Proc. NeurIPS","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. Flashattention: Fast and memory-efficient exact attention with ioawareness. In Proc. NeurIPS, 2022."},{"key":"e_1_3_2_1_31_1","volume-title":"Proc. ACL","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proc. ACL, 2018."},{"key":"e_1_3_2_1_32_1","volume-title":"Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997","author":"Gao Yunfan","year":"2023","unstructured":"Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, and HaofenWang. Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997, 2023."},{"key":"e_1_3_2_1_33_1","volume-title":"Prompt cache: Modular attention reuse for low-latency inference. arXiv preprint arXiv:2311.04934","author":"Gim In","year":"2023","unstructured":"In Gim, Guojun Chen, Seung-seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. Prompt cache: Modular attention reuse for low-latency inference. arXiv preprint arXiv:2311.04934, 2023."},{"key":"e_1_3_2_1_34_1","volume-title":"Proc. ACM Eurosys","author":"Gog Ionel","year":"2015","unstructured":"Ionel Gog, Malte Schwarzkopf, Natacha Crooks, Matthew P Grosvenor, Allen Clement, and Steven Hand. Musketeer: all for one, one for all in data processing systems. In Proc. ACM Eurosys, 2015."},{"key":"e_1_3_2_1_35_1","volume-title":"Proc. USENIX OSDI","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. Serving {DNNs} like clockwork: Performance predictability from the bottom up. In Proc. USENIX OSDI, 2020."},{"key":"e_1_3_2_1_36_1","volume-title":"Proc. Machine Learning and Systems","author":"Hong Ke","year":"2023","unstructured":"Ke Hong, Guohao Dai, Jiaming Xu, Qiuli Mao, Xiuhong Li, Jun Liu, Kangdi Chen, Hanyu Dong, and Yu Wang. Flashdecoding: Faster large language model inference on gpus. In Proc. Machine Learning and Systems, 2023."},{"key":"e_1_3_2_1_37_1","volume-title":"Data interpreter: An llm agent for data science. arXiv preprint arXiv:2402.18679","author":"Hong Sirui","year":"2024","unstructured":"Sirui Hong, Yizhang Lin, Bang Liu, Bangbang Liu, Binhao Wu, Danyang Li, Jiaqi Chen, Jiayi Zhang, Jinlin Wang, Li Zhang, Lingyao Zhang, Min Yang, Mingchen Zhuge, Taicheng Guo, Tuo Zhou, Wei Tao, Wenyi Wang, Xiangru Tang, Xiangtao Lu, Xiawu Zheng, Xinbing Liang, Yaying Fei, Yuheng Cheng, Zongze Xu, and Chenglin Wu. Data interpreter: An llm agent for data science. arXiv preprint arXiv:2402.18679, 2024."},{"key":"e_1_3_2_1_38_1","volume-title":"Zijuan Lin, Liyang Zhou, Chenyu Ran, Lingfeng Xiao, Chenglin Wu, and J\u00fcrgen Schmidhuber. Metagpt: Meta programming for a multiagent collaborative framework. arXiv preprint arXiv:2308.00352","author":"Hong Sirui","year":"2023","unstructured":"Sirui Hong, Mingchen Zhuge, Jonathan Chen, Xiawu Zheng, Yuheng Cheng, Ceyao Zhang, Jinlin Wang, Zili Wang, Steven Ka Shing Yau, Zijuan Lin, Liyang Zhou, Chenyu Ran, Lingfeng Xiao, Chenglin Wu, and J\u00fcrgen Schmidhuber. Metagpt: Meta programming for a multiagent collaborative framework. arXiv preprint arXiv:2308.00352, 2023."},{"key":"e_1_3_2_1_39_1","volume-title":"Inference without interference: Disaggregate llm inference for mixed downstream workloads. arXiv preprint arXiv:2401.11181","author":"Hu Cunchen","year":"2024","unstructured":"Cunchen Hu, Heyang Huang, Liangliang Xu, Xusheng Chen, Jiang Xu, Shuang Chen, Hao Feng, Chenxi Wang, Sa Wang, Yungang Bao, et al. Inference without interference: Disaggregate llm inference for mixed downstream workloads. arXiv preprint arXiv:2401.11181, 2024."},{"key":"e_1_3_2_1_40_1","volume-title":"A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions. arXiv preprint arXiv:2311.05232","author":"Huang Lei","year":"2023","unstructured":"Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions. arXiv preprint arXiv:2311.05232, 2023."},{"key":"e_1_3_2_1_41_1","volume-title":"Tool calling: Enhancing medication consultation via retrieval-augmented large language models. arXiv preprint arXiv:2404.17897","author":"Huang Zhongzhen","year":"2024","unstructured":"Zhongzhen Huang, Kui Xue, Yongqi Fan, Linjie Mu, Ruoyu Liu, Tong Ruan, Shaoting Zhang, and Xiaofan Zhang. Tool calling: Enhancing medication consultation via retrieval-augmented large language models. arXiv preprint arXiv:2404.17897, 2024."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/1272996.1273005"},{"key":"e_1_3_2_1_43_1","volume-title":"Query expansion by prompting large language models. arXiv preprint arXiv:2305.03653","author":"Jagerman Rolf","year":"2023","unstructured":"Rolf Jagerman, Honglei Zhuang, Zhen Qin, Xuanhui Wang, and Michael Bendersky. Query expansion by prompting large language models. arXiv preprint arXiv:2305.03653, 2023."},{"key":"e_1_3_2_1_44_1","volume-title":"Sung Ju Hwang, and Jong C Park. Adaptive-rag: Learning to adapt retrieval-augmented large language models through question complexity. arXiv preprint arXiv:2403.14403","author":"Jeong Soyeong","year":"2024","unstructured":"Soyeong Jeong, Jinheon Baek, Sukmin Cho, Sung Ju Hwang, and Jong C Park. Adaptive-rag: Learning to adapt retrieval-augmented large language models through question complexity. arXiv preprint arXiv:2403.14403, 2024."},{"key":"e_1_3_2_1_45_1","volume-title":"Ragcache: Efficient knowledge caching for retrievalaugmented generation. arXiv preprint arXiv:2404.12457","author":"Jin Chao","year":"2024","unstructured":"Chao Jin, Zili Zhang, Xuanlin Jiang, Fangyue Liu, Xin Liu, Xuanzhe Liu, and Xin Jin. Ragcache: Efficient knowledge caching for retrievalaugmented generation. arXiv preprint arXiv:2404.12457, 2024."},{"key":"e_1_3_2_1_46_1","volume-title":"et al. Dspy: Compiling declarative language model calls into self-improving pipelines. arXiv preprint arXiv:2310.03714","author":"Khattab Omar","year":"2023","unstructured":"Omar Khattab, Arnav Singhvi, Paridhi Maheshwari, Zhiyuan Zhang, Keshav Santhanam, Sri Vardhamanan, Saiful Haq, Ashutosh Sharma, Thomas T Joshi, Hanna Moazam, et al. Dspy: Compiling declarative language model calls into self-improving pipelines. arXiv preprint arXiv:2310.03714, 2023."},{"key":"e_1_3_2_1_47_1","volume-title":"An llm compiler for parallel function calling. arXiv preprint arXiv:2312.04511","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Suhong Moon, Ryan Tabrizi, Nicholas Lee, Michael W Mahoney, Kurt Keutzer, and Amir Gholami. An llm compiler for parallel function calling. arXiv preprint arXiv:2312.04511, 2023."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_49_1","volume-title":"Retrieval-augmented generation for knowledgeintensive nlp tasks","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. Retrieval-augmented generation for knowledgeintensive nlp tasks, 2020."},{"key":"e_1_3_2_1_50_1","volume-title":"Proc. USENIX OSDI","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. {AlpaServe}: Statistical multiplexing with model parallelism for deep learning serving. In Proc. USENIX OSDI, 2023."},{"key":"e_1_3_2_1_51_1","volume-title":"et al. Infinite-llm: Efficient llm service for long context with distattention and distributed kvcache. arXiv preprint arXiv:2401.02669","author":"Lin Bin","year":"2024","unstructured":"Bin Lin, Tao Peng, Chen Zhang, Minmin Sun, Lanbo Li, Hanyu Zhao, Wencong Xiao, Qi Xu, Xiafei Qiu, Shen Li, et al. Infinite-llm: Efficient llm service for long context with distattention and distributed kvcache. arXiv preprint arXiv:2401.02669, 2024."},{"key":"e_1_3_2_1_52_1","volume-title":"Proc. USENIX OSDI","author":"Lin Chaofan","year":"2024","unstructured":"Chaofan Lin, Zhenhua Han, Chengruidong Zhang, Yuqing Yang, Fan Yang, Chen Chen, and Lili Qiu. Parrot: Efficient serving of llm-based applications with semantic variable. In Proc. USENIX OSDI, 2024."},{"key":"e_1_3_2_1_53_1","volume-title":"Truthfulqa: Measuring how models mimic human falsehoods. arXiv preprint arXiv:2109.07958","author":"Lin Stephanie","year":"2021","unstructured":"Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: Measuring how models mimic human falsehoods. arXiv preprint arXiv:2109.07958, 2021."},{"key":"e_1_3_2_1_54_1","volume-title":"Optimizing llm queries in relational workloads. arXiv preprint arXiv:2403.05821","author":"Liu Shu","year":"2024","unstructured":"Shu Liu, Asim Biswal, Audrey Cheng, Xiangxi Mo, Shiyi Cao, Joseph E Gonzalez, Ion Stoica, and Matei Zaharia. Optimizing llm queries in relational workloads. arXiv preprint arXiv:2403.05821, 2024."},{"key":"e_1_3_2_1_55_1","volume-title":"Optimizing llm queries in relational workloads. arXiv preprint arXiv:2403.05821","author":"Liu Shu","year":"2024","unstructured":"Shu Liu, Asim Biswal, Audrey Cheng, Xiangxi Mo, Shiyi Cao, Joseph E Gonzalez, Ion Stoica, and Matei Zaharia. Optimizing llm queries in relational workloads. arXiv preprint arXiv:2403.05821, 2024."},{"key":"e_1_3_2_1_56_1","volume-title":"Online speculative decoding. arXiv preprint arXiv:2310.07177","author":"Liu Xiaoxuan","year":"2023","unstructured":"Xiaoxuan Liu, Lanxiang Hu, Peter Bailis, Ion Stoica, Zhijie Deng, Alvin Cheung, and Hao Zhang. Online speculative decoding. arXiv preprint arXiv:2310.07177, 2023."},{"key":"e_1_3_2_1_57_1","volume-title":"Ra-isf: Learning to answer and understand from retrieval augmentation via iterative self-feedback. arXiv preprint arXiv:2403.06840","author":"Liu Yanming","year":"2024","unstructured":"Yanming Liu, Xinyue Peng, Xuhong Zhang, Weihao Liu, Jianwei Yin, Jiannan Cao, and Tianyu Du. Ra-isf: Learning to answer and understand from retrieval augmentation via iterative self-feedback. arXiv preprint arXiv:2403.06840, 2024."},{"key":"e_1_3_2_1_58_1","volume-title":"Proc. NeurIPS","author":"Madaan Aman","year":"2024","unstructured":"Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al. Self-refine: Iterative refinement with self-feedback. In Proc. NeurIPS, 2024."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"e_1_3_2_1_61_1","volume-title":"Proc. USENIX OSDI","author":"Moritz Philipp","year":"2018","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael I Jordan, et al. Ray: A distributed framework for emerging {AI} applications. In Proc. USENIX OSDI, 2018."},{"key":"e_1_3_2_1_62_1","volume-title":"Lossless acceleration of large language model via adaptive n-gram parallel decoding. arXiv preprint arXiv:2404.08698","author":"Ou Jie","year":"2024","unstructured":"Jie Ou, Yueming Chen, and Wenhong Tian. Lossless acceleration of large language model via adaptive n-gram parallel decoding. arXiv preprint arXiv:2404.08698, 2024."},{"key":"e_1_3_2_1_63_1","volume-title":"Splitwise: Efficient generative llm inference using phase splitting. arXiv preprint arXiv:2311.18677","author":"Patel Pratyush","year":"2023","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, \u00cd\u00f1igo Goiri, Aashaka Shah, Saeed Maleki, and Ricardo Bianchini. Splitwise: Efficient generative llm inference using phase splitting. arXiv preprint arXiv:2311.18677, 2023."},{"key":"e_1_3_2_1_64_1","volume-title":"Proc. NeurIPS","author":"Shen Yongliang","year":"2023","unstructured":"Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li,Weiming Lu, and Yueting Zhuang. Hugginggpt: Solving ai tasks with chatgpt and its friends in huggingface. In Proc. NeurIPS, 2023."},{"key":"e_1_3_2_1_65_1","volume-title":"Fairness in serving large language models. arXiv preprint arXiv:2401.00588","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Banghua Zhu, Zhuohan Li, Danyang Zhuo, Joseph E Gonzalez, and Ion Stoica. Fairness in serving large language models. arXiv preprint arXiv:2401.00588, 2023."},{"key":"e_1_3_2_1_66_1","volume-title":"Small models, big insights: Leveraging slim proxy models to decide when and what to retrieve for llms. arXiv preprint arXiv:2402.12052","author":"Tan Jiejun","year":"2024","unstructured":"Jiejun Tan, Zhicheng Dou, Yutao Zhu, Peidong Guo, Kun Fang, and Ji-Rong Wen. Small models, big insights: Leveraging slim proxy models to decide when and what to retrieve for llms. arXiv preprint arXiv:2402.12052, 2024."},{"key":"e_1_3_2_1_67_1","unstructured":"Gemma Team Morgane Riviere Shreya Pathak Pier Giuseppe Sessa Cassidy Hardin Surya Bhupatiraju L\u00e9onard Hussenot Thomas Mesnard Bobak Shahriari Alexandre Ram\u00e9 et al. Gemma 2: Improving open language models at a practical size. arXiv preprint arXiv:2408.00118 2024."},{"key":"e_1_3_2_1_68_1","volume-title":"Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie- Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_69_1","volume-title":"Llama 2: Open foundation and finetuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et al. Llama 2: Open foundation and finetuned chat models. arXiv preprint arXiv:2307.09288, 2023."},{"key":"e_1_3_2_1_70_1","volume-title":"Proc. NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is All you Need. In Proc. NeurIPS, 2017."},{"key":"e_1_3_2_1_71_1","volume-title":"Loongserve: Efficiently serving long-context large language models with elastic sequence parallelism. arXiv preprint arXiv:2404.09526","author":"Wu Bingyang","year":"2024","unstructured":"Bingyang Wu, Shengyu Liu, Yinmin Zhong, Peng Sun, Xuanzhe Liu, and Xin Jin. Loongserve: Efficiently serving long-context large language models with elastic sequence parallelism. arXiv preprint arXiv:2404.09526, 2024."},{"key":"e_1_3_2_1_72_1","volume-title":"Autogen: Enabling next-gen llm applications via multi-agent conversation framework. arXiv preprint arXiv:2308.08155","author":"Wu Qingyun","year":"2023","unstructured":"Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Shaokun Zhang, Erkang Zhu, Beibin Li, Li Jiang, Xiaoyun Zhang, and Chi Wang. Autogen: Enabling next-gen llm applications via multi-agent conversation framework. arXiv preprint arXiv:2308.08155, 2023."},{"key":"e_1_3_2_1_73_1","volume-title":"Proc. ICML","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. Smoothquant: Accurate and efficient post-training quantization for large language models. In Proc. ICML, 2023."},{"key":"e_1_3_2_1_74_1","volume-title":"Cpack: Packaged resources to advance general chinese embedding. arXiv preprint arXiv:2309.07597","author":"Xiao Shitao","year":"2023","unstructured":"Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighof. Cpack: Packaged resources to advance general chinese embedding. arXiv preprint arXiv:2309.07597, 2023."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1259"},{"key":"e_1_3_2_1_76_1","volume-title":"Proc. USENIX OSDI","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for {Transformer-Based} generative models. In Proc. USENIX OSDI, 2022."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934664"},{"key":"e_1_3_2_1_78_1","volume-title":"Proc. USENIX NSDI","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. {SHEPHERD}: Serving {DNNs} in the wild. In Proc. USENIX NSDI, 2023."},{"key":"e_1_3_2_1_79_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. Efficiently programming large language models using sglang. arXiv preprint arXiv:2312.07104","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. Efficiently programming large language models using sglang. arXiv preprint arXiv:2312.07104, 2023."},{"key":"e_1_3_2_1_80_1","volume-title":"Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving. arXiv preprint arXiv:2401.09670","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving. arXiv preprint arXiv:2401.09670, 2024."},{"key":"e_1_3_2_1_81_1","volume-title":"On optimal caching and model multiplexing for large model inference. arXiv preprint arXiv:2306.02003","author":"Zhu Banghua","year":"2023","unstructured":"Banghua Zhu, Ying Sheng, Lianmin Zheng, Clark Barrett, Michael I Jordan, and Jiantao Jiao. On optimal caching and model multiplexing for large model inference. arXiv preprint arXiv:2306.02003, 2023."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716278","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676641.3716278","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:14:00Z","timestamp":1755774840000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716278"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":81,"alternative-id":["10.1145\/3676641.3716278","10.1145\/3676641"],"URL":"https:\/\/doi.org\/10.1145\/3676641.3716278","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}