{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T00:36:34Z","timestamp":1778373394956,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2146496"],"award-info":[{"award-number":["CNS-2146496"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2131826"],"award-info":[{"award-number":["CNS-2131826"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2313190"],"award-info":[{"award-number":["CNS-2313190"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-1901466"],"award-info":[{"award-number":["CNS-1901466"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006785","name":"Google","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006785","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764834","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"399-414","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["PrefillOnly: An Inference Engine for Prefill-only Workloads in Large Language Model Applications"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3964-4079","authenticated-orcid":false,"given":"Kuntai","family":"Du","sequence":"first","affiliation":[{"name":"University of Chicago \/ TensorMesh, Inc., Foster City, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9535-5350","authenticated-orcid":false,"given":"Bowen","family":"Wang","sequence":"additional","affiliation":[{"name":"Sky Computing Lab, Tsinghua University \/ UC Berkeley, Berkeley, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9045-9269","authenticated-orcid":false,"given":"Chen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sky Computing Lab, Tsinghua University \/ UC Berkeley, Berkeley, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1754-6880","authenticated-orcid":false,"given":"Yiming","family":"Cheng","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, Illinois, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6223-4160","authenticated-orcid":false,"given":"Qing","family":"Lan","sequence":"additional","affiliation":[{"name":"LinkedIn, Mountain View, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2001-677X","authenticated-orcid":false,"given":"Hejian","family":"Sang","sequence":"additional","affiliation":[{"name":"LinkedIn, Mountain View, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3924-6886","authenticated-orcid":false,"given":"Yihua","family":"Cheng","sequence":"additional","affiliation":[{"name":"University of Chicago \/ TensorMesh, Inc., Foster City, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8588-4356","authenticated-orcid":false,"given":"Jiayi","family":"Yao","sequence":"additional","affiliation":[{"name":"University of Chicago \/ TensorMesh, Inc., Foster City, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4968-1027","authenticated-orcid":false,"given":"Xiaoxuan","family":"Liu","sequence":"additional","affiliation":[{"name":"Sky Computing Lab, UC Berkeley, Berkeley, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3651-6973","authenticated-orcid":false,"given":"Yifan","family":"Qiao","sequence":"additional","affiliation":[{"name":"Sky Computing Lab, UC Berkeley, Berkeley, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5373-0088","authenticated-orcid":false,"given":"Ion","family":"Stoica","sequence":"additional","affiliation":[{"name":"Sky Computing Lab, UC Berkeley, Berkeley, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6877-1683","authenticated-orcid":false,"given":"Junchen","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Chicago \/ TensorMesh, Inc., Foster City, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"add multi-item scoring by arde171 \u00b7 Pull Request #1015 \u00b7 flashinfer-ai\/flashinfer \u2014 github.com. https:\/\/github.com\/flashinfer-ai\/flashinfer\/pull\/1015. [Accessed 20-08-2025]."},{"key":"e_1_3_2_1_2_1","unstructured":"character.ai | personalized ai for every moment of your day. https:\/\/character.ai\/. (Accessed on 09\/07\/2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"How to import, add, & open \u2013 arc help center. [Online","author":"Extensions","year":"2025","unstructured":"Extensions in arc: How to import, add, & open \u2013 arc help center. [Online; accessed 2025-04-17]."},{"key":"e_1_3_2_1_4_1","unstructured":"GitHub - kvcache-ai\/Mooncake: Mooncake is the serving platform for Kimi a leading LLM service provided by Moonshot AI. \u2014 github.com. https:\/\/github.com\/kvcache-ai\/Mooncake. [Accessed 18-08-2025]."},{"key":"e_1_3_2_1_5_1","first-page":"134","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. Taming {Throughput-Latency} tradeoff in {LLM} inference with {Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 117\u2013134, 2024."},{"key":"e_1_3_2_1_6_1","first-page":"947","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Ansel Jason","year":"2024","unstructured":"Jason Ansel, Edward Yang, Horace He, Natalia Gimelshein, Animesh Jain, Michael Voznesensky, Bin Bao, Peter Bell, David Berard, Evgeni Burovski, et al. Pytorch 2: Faster machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2, pages 929\u2013947, 2024."},{"key":"e_1_3_2_1_7_1","volume-title":"The ai code editor","year":"2025","unstructured":"Anysphere. The ai code editor. https:\/\/cursor.com\/, 2025."},{"key":"e_1_3_2_1_8_1","volume-title":"Locality-aware fair scheduling in llm serving. arXiv preprint arXiv:2501.14312","author":"Cao Shiyi","year":"2025","unstructured":"Shiyi Cao, Yichuan Wang, Ziming Mao, Pin-Lun Hsu, Liangsheng Yin, Tian Xia, Dacheng Li, Shu Liu, Yineng Zhang, Yang Zhou, et al. Locality-aware fair scheduling in llm serving. arXiv preprint arXiv:2501.14312, 2025."},{"key":"e_1_3_2_1_9_1","first-page":"168","volume-title":"Proceedings of the 13th ACM Conference on Embedded Networked Sensor Systems","author":"Yu-Han Chen Tiffany","year":"2015","unstructured":"Tiffany Yu-Han Chen, Lenin Ravindranath, Shuo Deng, Paramvir Bahl, and Hari Balakrishnan. Glimpse: Continuous, real-time object recognition on mobile devices. In Proceedings of the 13th ACM Conference on Embedded Networked Sensor Systems, pages 155\u2013168, 2015."},{"key":"e_1_3_2_1_10_1","volume-title":"A scalable approach to distributed large language model inference","author":"Cheng Yihua","year":"2025","unstructured":"Yihua Cheng. A scalable approach to distributed large language model inference. 2025."},{"key":"e_1_3_2_1_11_1","volume-title":"Do large language models need a content delivery network? arXiv preprint arXiv:2409.13761","author":"Cheng Yihua","year":"2024","unstructured":"Yihua Cheng, Kuntai Du, Jiayi Yao, and Junchen Jiang. Do large language models need a content delivery network? arXiv preprint arXiv:2409.13761, 2024."},{"key":"e_1_3_2_1_12_1","volume-title":"Onerec: Unifying retrieve and rank with generative recommender and iterative preference alignment. arXiv preprint arXiv:2502.18965","author":"Deng Jiaxin","year":"2025","unstructured":"Jiaxin Deng, Shiyao Wang, Kuo Cai, Lejian Ren, Qigen Hu, Weifeng Ding, Qiang Luo, and Guorui Zhou. Onerec: Unifying retrieve and rank with generative recommender and iterative preference alignment. arXiv preprint arXiv:2502.18965, 2025."},{"key":"e_1_3_2_1_13_1","volume-title":"Xgrammar: Flexible and efficient structured generation engine for large language models. arXiv preprint arXiv:2411.15100","author":"Dong Yixin","year":"2024","unstructured":"Yixin Dong, Charlie F Ruan, Yaxing Cai, Ruihang Lai, Ziyi Xu, Yilong Zhao, and Tianqi Chen. Xgrammar: Flexible and efficient structured generation engine for large language models. arXiv preprint arXiv:2411.15100, 2024."},{"key":"e_1_3_2_1_14_1","first-page":"176","volume-title":"Proceedings of the 2023 ACM Symposium on Cloud Computing","author":"Du Kuntai","year":"2023","unstructured":"Kuntai Du, Yuhan Liu, Yitian Hao, Qizheng Zhang, Haodong Wang, Yuyang Huang, Ganesh Ananthanarayanan, and Junchen Jiang. Oneadapt: Fast adaptation for deep learning applications via back-propagation. In Proceedings of the 2023 ACM Symposium on Cloud Computing, pages 158\u2013176, 2023."},{"key":"e_1_3_2_1_15_1","first-page":"450","article-title":"Optimizing video encoding for accurate video analytics","volume":"4","author":"Du Kuntai","year":"2022","unstructured":"Kuntai Du, Qizheng Zhang, Anton Arapin, Haodong Wang, Zhengxu Xia, and Junchen Jiang. Accmpeg: Optimizing video encoding for accurate video analytics. Proceedings of Machine Learning and Systems, 4:450\u2013466, 2022.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_16_1","volume-title":"Empowering many, biasing a few: Generalist credit scoring through large language models. arXiv preprint arXiv:2310.00566","author":"Feng Duanyu","year":"2023","unstructured":"Duanyu Feng, Yongfu Dai, Jimin Huang, Yifang Zhang, Qianqian Xie, Weiguang Han, Zhengyu Chen, Alejandro Lopez-Lira, and Hao Wang. Empowering many, biasing a few: Generalist credit scoring through large language models. arXiv preprint arXiv:2310.00566, 2023."},{"key":"e_1_3_2_1_17_1","volume-title":"360brew: A decoder-only foundation model for personalized ranking and recommendation. arXiv preprint arXiv:2501.16450","author":"Firooz Hamed","year":"2025","unstructured":"Hamed Firooz, Maziar Sanjabi, Adrian Englhardt, Aman Gupta, Ben Levine, Dre Olgiati, Gungor Polatkan, Iuliia Melnychuk, Karthik Ramgopal, Kirill Talanine, et al. 360brew: A decoder-only foundation model for personalized ranking and recommendation. arXiv preprint arXiv:2501.16450, 2025."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11023-020-09548-1"},{"key":"e_1_3_2_1_19_1","volume-title":"Github copilot - write code faster. https:\/\/copilot.github.com\/","year":"2025","unstructured":"GitHub. Github copilot - write code faster. https:\/\/copilot.github.com\/, 2025."},{"key":"e_1_3_2_1_20_1","first-page":"500","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. Tiresias: A {GPU} cluster manager for distributed deep learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19), pages 485\u2013500, 2019."},{"key":"e_1_3_2_1_21_1","volume-title":"et al. Mtgr: Industrial-scale generative recommendation framework in meituan. arXiv preprint arXiv:2505.18654","author":"Han Ruidong","year":"2025","unstructured":"Ruidong Han, Bin Yin, Shangyu Chen, He Jiang, Fei Jiang, Xiang Li, Chi Ma, Mincong Huang, Xiaoguang Li, Chunzhen Jing, et al. Mtgr: Industrial-scale generative recommendation framework in meituan. arXiv preprint arXiv:2505.18654, 2025."},{"key":"e_1_3_2_1_22_1","volume-title":"Nan Duan, Weizhu Chen, et al. Annollm: Making large language models to be better crowdsourced annotators. arXiv preprint arXiv:2303.16854","author":"He Xingwei","year":"2023","unstructured":"Xingwei He, Zhenghao Lin, Yeyun Gong, Alex Jin, Hang Zhang, Chen Lin, Jian Jiao, Siu Ming Yiu, Nan Duan, Weizhu Chen, et al. Annollm: Making large language models to be better crowdsourced annotators. arXiv preprint arXiv:2303.16854, 2023."},{"key":"e_1_3_2_1_23_1","volume-title":"Kurt Keutzer, and Amir Gholami. Kvquant: Towards 10 million context length llm inference with kv cache quantization. arXiv preprint arXiv:2401.18079","author":"Hooper Coleman","year":"2024","unstructured":"Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael W Mahoney, Yakun Sophia Shao, Kurt Keutzer, and Amir Gholami. Kvquant: Towards 10 million context length llm inference with kv cache quantization. arXiv preprint arXiv:2401.18079, 2024."},{"key":"e_1_3_2_1_24_1","volume-title":"Epic: Efficient position-independent context caching for serving large language models","author":"Hu Junhao","year":"2024","unstructured":"Junhao Hu, Wenrui Huang, Haoyi Wang, Weidong Wang, Tiancheng Hu, Qin Zhang, Hao Feng, Xusheng Chen, Yizhou Shan, and Tao Xie. Epic: Efficient position-independent context caching for serving large language models, 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"Towards large-scale generative ranking. arXiv preprint arXiv:2505.04180","author":"Huang Yanhua","year":"2025","unstructured":"Yanhua Huang, Yuqi Chen, Xiong Cao, Rui Yang, Mingliang Qi, Yinghao Zhu, Qingchang Han, Yaowei Liu, Zhaoyu Liu, Xuefeng Yao, et al. Towards large-scale generative ranking. arXiv preprint arXiv:2505.04180, 2025."},{"key":"e_1_3_2_1_26_1","first-page":"11869","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"39","author":"Jia Jian","year":"2025","unstructured":"Jian Jia, Yipei Wang, Yan Li, Honggang Chen, Xuehan Bai, Zhaocheng Liu, Jian Liang, Quan Chen, Han Li, Peng Jiang, et al. Learn: Knowledge adaptation from large language model to recommendation for practical industrial application. In Proceedings of the AAAI Conference on Artificial Intelligence, volume 39, pages 11861\u201311869, 2025."},{"key":"e_1_3_2_1_27_1","volume-title":"Ragcache: Efficient knowledge caching for retrieval-augmented generation. arXiv preprint arXiv:2404.12457","author":"Jin Chao","year":"2024","unstructured":"Chao Jin, Zili Zhang, Xuanlin Jiang, Fangyue Liu, Xin Liu, Xuanzhe Liu, and Xin Jin. Ragcache: Efficient knowledge caching for retrieval-augmented generation. arXiv preprint arXiv:2404.12457, 2024."},{"key":"e_1_3_2_1_28_1","volume-title":"Gear: An efficient kv cache compression recipe for near-lossless generative inference of llm","author":"Kang Hao","year":"2024","unstructured":"Hao Kang, Qingru Zhang, Souvik Kundu, Geonhwa Jeong, Zaoxing Liu, Tushar Krishna, and Tuo Zhao. Gear: An efficient kv cache compression recipe for near-lossless generative inference of llm, 2024."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_31_1","volume-title":"Spam-t5: Benchmarking large language models for few-shot email spam detection. arXiv preprint arXiv:2304.01238","author":"Labonne Maxime","year":"2023","unstructured":"Maxime Labonne and Sean Moran. Spam-t5: Benchmarking large language models for few-shot email spam detection. arXiv preprint arXiv:2304.01238, 2023."},{"key":"e_1_3_2_1_32_1","volume-title":"Depression detection on social media with large language models. arXiv preprint arXiv:2403.10750","author":"Lan Xiaochong","year":"2024","unstructured":"Xiaochong Lan, Yiming Cheng, Li Sheng, Chen Gao, and Yong Li. Depression detection on social media with large language models. arXiv preprint arXiv:2403.10750, 2024."},{"key":"e_1_3_2_1_33_1","first-page":"376","volume-title":"Proceedings of the Annual conference of the ACM Special Interest Group on Data Communication on the applications, technologies, architectures, and protocols for computer communication","author":"Li Yuanqi","year":"2020","unstructured":"Yuanqi Li, Arthi Padmanabhan, Pengzhan Zhao, Yufei Wang, Guoqing Harry Xu, and Ravi Netravali. Reducto: On-camera filtering for resource-efficient real-time video analytics. In Proceedings of the Annual conference of the ACM Special Interest Group on Data Communication on the applications, technologies, architectures, and protocols for computer communication, pages 359\u2013376, 2020."},{"key":"e_1_3_2_1_34_1","first-page":"6552","volume-title":"International Conference on Machine Learning","author":"Li Zhuohan","unstructured":"Zhuohan Li, Siyuan Zhuang, Shiyuan Guo, Danyang Zhuo, Hao Zhang, Dawn Song, and Ion Stoica. Terapipe: Token-level pipeline parallelism for training large-scale language models. In International Conference on Machine Learning, pages 6543\u20136552. PMLR, 2021."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3300061.3300116"},{"key":"e_1_3_2_1_36_1","volume-title":"Optimizing llm queries in relational workloads","author":"Liu Shu","year":"2024","unstructured":"Shu Liu, Asim Biswal, Audrey Cheng, Xiangxi Mo, Shiyi Cao, Joseph E. Gonzalez, Ion Stoica, and Matei Zaharia. Optimizing llm queries in relational workloads, 2024."},{"key":"e_1_3_2_1_37_1","volume-title":"Fingpt: Democratizing internet-scale data for financial large language models. arXiv preprint arXiv:2307.10485","author":"Liu Xiao-Yang","year":"2023","unstructured":"Xiao-Yang Liu, Guoxuan Wang, Hongyang Yang, and Daochen Zha. Fingpt: Democratizing internet-scale data for financial large language models. arXiv preprint arXiv:2307.10485, 2023."},{"key":"e_1_3_2_1_38_1","first-page":"56","volume-title":"Proceedings of the ACM SIGCOMM 2024 Conference, ACM SIGCOMM '24","author":"Liu Yuhan","year":"2024","unstructured":"Yuhan Liu, Hanchen Li, Yihua Cheng, Siddhant Ray, Yuyang Huang, Qizheng Zhang, Kuntai Du, Jiayi Yao, Shan Lu, Ganesh Ananthanarayanan, Michael Maire, Henry Hoffmann, Ari Holtzman, and Junchen Jiang. Cachegen: Kv cache compression and streaming for fast large language model serving. In Proceedings of the ACM SIGCOMM 2024 Conference, ACM SIGCOMM '24, page 38\u201356, New York, NY, USA, 2024. Association for Computing Machinery."},{"key":"e_1_3_2_1_39_1","volume-title":"Kivi: A tuning-free asymmetric 2bit quantization for kv cache. arXiv preprint arXiv:2402.02750","author":"Liu Zirui","year":"2024","unstructured":"Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen Zhong, Zhaozhuo Xu, Vladimir Braverman, Beidi Chen, and Xia Hu. Kivi: A tuning-free asymmetric 2bit quantization for kv cache. arXiv preprint arXiv:2402.02750, 2024."},{"key":"e_1_3_2_1_40_1","volume-title":"Github - lmcache\/lmcache: Redis for llms. [Online","year":"2025","unstructured":"LMCache. Github - lmcache\/lmcache: Redis for llms. [Online; accessed 2025-04-17]."},{"key":"e_1_3_2_1_41_1","volume-title":"Chatgpt: Conversational language model. https:\/\/chat.openai.com","author":"AI.","year":"2025","unstructured":"OpenAI. Chatgpt: Conversational language model. https:\/\/chat.openai.com, 2025."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606763"},{"key":"e_1_3_2_1_43_1","first-page":"132","volume-title":"2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)","author":"Patel Pratyush","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, Aashaka Shah, \u00cd\u00f1igo Goiri, Saeed Maleki, and Ricardo Bianchini. Splitwise: Efficient generative llm inference using phase splitting. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA), pages 118\u2013132. IEEE, 2024."},{"key":"e_1_3_2_1_44_1","first-page":"14","volume-title":"Proceedings of the Thirteenth EuroSys Conference","author":"Peng Yanghua","year":"2018","unstructured":"Yanghua Peng, Yixin Bao, Yangrui Chen, Chuan Wu, and Chuanxiong Guo. Optimus: an efficient dynamic resource scheduler for deep learning clusters. In Proceedings of the Thirteenth EuroSys Conference, pages 1\u201314, 2018."},{"key":"e_1_3_2_1_45_1","volume-title":"Perplexity is a free ai search engine. https:\/\/www.perplexity.ai\/","author":"Perlexity","year":"2025","unstructured":"Perlexity AI. Perplexity is a free ai search engine. https:\/\/www.perplexity.ai\/, 2025."},{"key":"e_1_3_2_1_46_1","volume-title":"Mooncake: A kvcache-centric disaggregated architecture for llm serving. arXiv preprint arXiv:2407.00079","author":"Qin Ruoyu","year":"2024","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. Mooncake: A kvcache-centric disaggregated architecture for llm serving. arXiv preprint arXiv:2407.00079, 2024."},{"key":"e_1_3_2_1_47_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_48_1","article-title":"A human-centered recommendation framework with llm agents","author":"Shu Yubo","year":"2024","unstructured":"Yubo Shu, Haonan Zhang, Hansu Gu, Peng Zhang, Tun Lu, Dongsheng Li, and Ning Gu. Rah! recsys-assistant-human: A human-centered recommendation framework with llm agents. IEEE Transactions on Computational Social Systems, 2024.","journal-title":"IEEE Transactions on Computational Social Systems"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11023-022-09602-0"},{"key":"e_1_3_2_1_50_1","volume-title":"Beyond classification: Financial reasoning in state-of-the-art language models. arXiv preprint arXiv:2305.01505","author":"Son Guijin","year":"2023","unstructured":"Guijin Son, Hanearl Jung, Moonjeong Hahm, Keonju Na, and Sol Jin. Beyond classification: Financial reasoning in state-of-the-art language models. arXiv preprint arXiv:2305.01505, 2023."},{"key":"e_1_3_2_1_51_1","volume-title":"Enhancing recommender systems with large language model reasoning graphs. arXiv preprint arXiv:2308.10835","author":"Wang Yan","year":"2023","unstructured":"Yan Wang, Zhixuan Chu, Xin Ouyang, Simeng Wang, Hongyan Hao, Yue Shen, Jinjie Gu, Siqiao Xue, James Y Zhang, Qing Cui, et al. Enhancing recommender systems with large language model reasoning graphs. arXiv preprint arXiv:2308.10835, 2023."},{"key":"e_1_3_2_1_52_1","volume-title":"Denny Zhou, et al. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, 35:24824\u201324837","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, 35:24824\u201324837, 2022."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-024-01291-2"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1002\/for.2625"},{"key":"e_1_3_2_1_55_1","volume-title":"Auto-gpt for online decision making: Benchmarks and additional opinions. arXiv preprint arXiv:2306.02224","author":"Yang Hui","year":"2023","unstructured":"Hui Yang, Sifu Yue, and Yunzhong He. Auto-gpt for online decision making: Benchmarks and additional opinions. arXiv preprint arXiv:2306.02224, 2023."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.3390\/app15073676"},{"key":"e_1_3_2_1_57_1","volume-title":"et al. Gr-llms: Recent advances in generative recommendation based on large language models. arXiv preprint arXiv:2507.06507","author":"Yang Zhen","year":"2025","unstructured":"Zhen Yang, Haitao Lin, Ziji Zhang, et al. Gr-llms: Recent advances in generative recommendation based on large language models. arXiv preprint arXiv:2507.06507, 2025."},{"key":"e_1_3_2_1_58_1","first-page":"109","volume-title":"Proceedings of the Twentieth European Conference on Computer Systems, EuroSys '25","author":"Yao Jiayi","year":"2025","unstructured":"Jiayi Yao, Hanchen Li, Yuhan Liu, Siddhant Ray, Yihua Cheng, Qizheng Zhang, Kuntai Du, Shan Lu, and Junchen Jiang. Cacheblend: Fast large language model serving for rag with cached knowledge fusion. In Proceedings of the Twentieth European Conference on Computer Systems, EuroSys '25, page 94\u2013109, New York, NY, USA, 2025. Association for Computing Machinery."},{"key":"e_1_3_2_1_59_1","volume-title":"et al. Flashinfer: Efficient and customizable attention engine for llm inference serving. arXiv preprint arXiv:2501.01005","author":"Ye Zihao","year":"2025","unstructured":"Zihao Ye, Lequn Chen, Ruihang Lai, Wuwei Lin, Yineng Zhang, Stephanie Wang, Tianqi Chen, Baris Kasikci, Vinod Grover, Arvind Krishnamurthy, et al. Flashinfer: Efficient and customizable attention engine for llm inference serving. arXiv preprint arXiv:2501.01005, 2025."},{"key":"e_1_3_2_1_60_1","first-page":"538","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521\u2013538, 2022."},{"key":"e_1_3_2_1_61_1","volume-title":"Actions speak louder than words: Trillion-parameter sequential transducers for generative recommendations. arXiv preprint arXiv:2402.17152","author":"Zhai Jiaqi","year":"2024","unstructured":"Jiaqi Zhai, Lucy Liao, Xing Liu, Yueming Wang, Rui Li, Xuan Cao, Leon Gao, Zhaojie Gong, Fangda Gu, Michael He, et al. Actions speak louder than words: Trillion-parameter sequential transducers for generative recommendations. arXiv preprint arXiv:2402.17152, 2024."},{"key":"e_1_3_2_1_62_1","first-page":"20","volume-title":"Proceedings of the 3rd Workshop on Practical Adoption Challenges of ML for Systems","author":"Zhang Qizheng","year":"2024","unstructured":"Qizheng Zhang, Ali Imran, Enkeleda Bardhi, Tushar Swamy, Nathan Zhang, Muhammad Shahbaz, and Kunle Olukotun. Caravan: practical online learning of in-network ml models with labeling agents. In Proceedings of the 3rd Workshop on Practical Adoption Challenges of ML for Systems, pages 17\u201320, 2024."},{"key":"e_1_3_2_1_63_1","first-page":"13103","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2023","author":"Zhang Ruoyu","year":"2023","unstructured":"Ruoyu Zhang, Yanzeng Li, Yongliang Ma, Ming Zhou, and Lei Zou. Llmaaa: Making large language models as active annotators. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 13088\u201313103, 2023."},{"key":"e_1_3_2_1_64_1","volume-title":"Qwen3 embedding: Advancing text embedding and reranking through foundation models. arXiv preprint arXiv:2506.05176","author":"Zhang Yanzhao","year":"2025","unstructured":"Yanzhao Zhang, Mingxin Li, Dingkun Long, Xin Zhang, Huan Lin, Baosong Yang, Pengjun Xie, An Yang, Dayiheng Liu, Junyang Lin, et al. Qwen3 embedding: Advancing text embedding and reranking through foundation models. arXiv preprint arXiv:2506.05176, 2025."},{"key":"e_1_3_2_1_65_1","first-page":"36","article-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models","author":"Zhang Zhenyu","year":"2024","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, et al. H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_66_1","volume-title":"Mpic: Position-independent multimodal context caching system for efficient mllm serving","author":"Zhao Shiju","year":"2025","unstructured":"Shiju Zhao, Junhao Hu, Rongxiao Huang, Jiaqi Zheng, and Guihai Chen. Mpic: Position-independent multimodal context caching system for efficient mllm serving, 2025."},{"key":"e_1_3_2_1_67_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. Sglang: Efficient execution of structured language model programs","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. Sglang: Efficient execution of structured language model programs, 2024."},{"key":"e_1_3_2_1_68_1","first-page":"210","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. {DistServe}: Disaggregating prefill and decoding for goodput-optimized large language model serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 193\u2013210, 2024."},{"key":"e_1_3_2_1_69_1","first-page":"765","volume-title":"19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25)","author":"Zhu Kan","year":"2025","unstructured":"Kan Zhu, Yufei Gao, Yilong Zhao, Liangyu Zhao, Gefei Zuo, Yile Gu, Dedong Xie, Zihao Ye, Keisuke Kamahori, Chien-Yu Lin, et al. {NanoFlow}: Towards optimal large language model serving throughput. In 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25), pages 749\u2013765, 2025."}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:44:31Z","timestamp":1759322671000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764834"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":69,"alternative-id":["10.1145\/3731569.3764834","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764834","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}