{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T15:31:52Z","timestamp":1773588712804,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":22,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3779212.3790206","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T13:55:26Z","timestamp":1773150926000},"page":"1492-1507","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["QoServe: Breaking the Silos of LLM Inference Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0675-4771","authenticated-orcid":false,"given":"Kanishk","family":"Goel","sequence":"first","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5260-3203","authenticated-orcid":false,"given":"Jayashree","family":"Mohan","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0354-6204","authenticated-orcid":false,"given":"Nipun","family":"Kwatra","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7563-5188","authenticated-orcid":false,"given":"Ravi Shreyas","family":"Anupindi","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0007-6040","authenticated-orcid":false,"given":"Ramachandran","family":"Ramjee","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n.d.]. vLLM: Easy fast and cheap LLM serving for everyone. https:\/\/github.com\/vllm-project\/vllm."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2407.07000"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of The Seventh Annual Conference on Machine Learning and Systems, 2024","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Jayashree Mohan, Ashish Panwar, Nipun Kwatra, Bhargav S Gulavani, Ramachandran Ramjee, and Alexey Tumanov. 2024. Vidur: A Large-Scale Simulation Framework For LLM Inference. Proceedings of The Seventh Annual Conference on Machine Learning and Systems, 2024, Santa Clara (2024)."},{"key":"e_1_3_2_1_4_1","first-page":"117","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming {Throughput-Latency} tradeoff in {LLM} inference with {Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 117-134."},{"key":"e_1_3_2_1_5_1","volume-title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arXiv:2308.16369 [cs.LG]","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S. Gulavani, and Ramachandran Ramjee. 2023. SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arXiv:2308.16369 [cs.LG]"},{"key":"e_1_3_2_1_6_1","volume-title":"Medha: Efficiently Serving Multi-Million Context Length LLM Inference Requests Without Approximations. arXiv:2409.17264 [cs.LG] https:\/\/arxiv.org\/abs\/2409.17264","author":"Agrawal Amey","year":"2025","unstructured":"Amey Agrawal, Haoran Qiu, Junda Chen, \u00cd\u00f1igo Goiri, Chaojie Zhang, Rayyan Shahid, Ramachandran Ramjee, Alexey Tumanov, and Esha Choukse. 2025. Medha: Efficiently Serving Multi-Million Context Length LLM Inference Requests Without Approximations. arXiv:2409.17264 [cs.LG] https:\/\/arxiv.org\/abs\/2409.17264"},{"key":"e_1_3_2_1_7_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023. Qwen Technical Report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Gibbons","author":"Chen Siyuan","year":"2025","unstructured":"Siyuan Chen, Zhipeng Jia, Samira Khan, Arvind Krishnamurthy, and Phillip B. Gibbons. 2025. SLOs-Serve: Optimized Serving of Multi-SLO LLMs. arXiv:2504.08784 [cs.DC] https:\/\/arxiv.org\/abs\/2504.08784"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Shashwat Jaiswal Kunal Jain Yogesh Simmhan Anjaly Parayil Ankur Mallick Rujia Wang Renee St. Amant Chetan Bansal Victor R\u00fchle Anoop Kulkarni Steve Kofsky and Saravan Rajmohan. 2025. Sage-Serve: Optimizing LLM Serving on Cloud Data Centers with Forecast Aware Auto-Scaling. arXiv:2502.14617 [cs.DC] https:\/\/arxiv.org\/abs\/2502.14617","DOI":"10.1145\/3771576"},{"key":"e_1_3_2_1_10_1","unstructured":"Siddharth Jha Coleman Hooper Xiaoxuan Liu Sehoon Kim and Kurt Keutzer. 2024. Learned Best-Effort LLM Serving. arXiv:2401.07886 [cs.LG] https:\/\/arxiv.org\/abs\/2401.07886"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_12_1","unstructured":"Zhuohan Li Lianmin Zheng Yinmin Zhong Vincent Liu Ying Sheng Xin Jin Yanping Huang Zhifeng Chen Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. arXiv:2302.11665 [cs.LG] https:\/\/arxiv.org\/abs\/2302.11665"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Ramya Prabhu Ajay Nayak Jayashree Mohan Ramachandran Ramjee and Ashish Panwar. 2025. vAttention: Dynamic Memory Management for Serving LLMs without PagedAttention. arXiv:2405.04437 [cs.LG] https:\/\/arxiv.org\/abs\/2405.04437","DOI":"10.1145\/3669940.3707256"},{"key":"e_1_3_2_1_14_1","unstructured":"Yifan Qiao Shu Anzai Shan Yu Haoran Ma Yang Wang Miryung Kim and Harry Xu. 2024. ConServe: Harvesting GPUs for Low- Latency and High-Throughput Large Language Model Serving. arXiv:2410.01228 [cs.DC] https:\/\/arxiv.org\/abs\/2410.01228"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Jovan Stojkovic Chaojie Zhang \u00cd\u00f1igo Goiri Josep Torrellas and Esha Choukse. 2024. DynamoLLM: Designing LLM Inference Clusters for Performance and Energy Efficiency. arXiv:2408.00741 [cs.AI] https:\/\/arxiv.org\/abs\/2408.00741","DOI":"10.1109\/HPCA61900.2025.00102"},{"key":"e_1_3_2_1_16_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 [cs.CL]"},{"key":"e_1_3_2_1_17_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, ? ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/ file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_18_1","unstructured":"vLLM Project. 2024. [RFC] Upstream Chunked Prefill #3130. https:\/\/github.com\/vllm-project\/vllm\/issues\/3130."},{"key":"e_1_3_2_1_19_1","unstructured":"Guan Wang Sijie Cheng Xianyuan Zhan Xiangang Li Sen Song and Yang Liu. 2024. OpenChat: Advancing Open-source Language Models with Mixed-Quality Data. arXiv:2309.11235 [cs.CL] https:\/\/arxiv.org\/abs\/2309.11235"},{"key":"e_1_3_2_1_20_1","volume-title":"Tempo: Application-aware LLM Serving with Mixed SLO Requirements. arXiv:2504.20068 [cs.DC] https:\/\/arxiv.org\/abs\/2504.20068","author":"Zhang Wei","year":"2025","unstructured":"Wei Zhang, Zhiyu Wu, Yi Mu, Banruo Liu, Myungjin Lee, and Fan Lai. 2025. Tempo: Application-aware LLM Serving with Mixed SLO Requirements. arXiv:2504.20068 [cs.DC] https:\/\/arxiv.org\/abs\/2504.20068"},{"key":"e_1_3_2_1_21_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng.","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. 2024. SGLang: Efficient Execution of Structured Language Model Programs. arXiv:2312.07104 [cs.AI] https:\/\/arxiv.org\/abs\/2312.07104"},{"key":"e_1_3_2_1_22_1","unstructured":"Kan Zhu Haiyang Shi Le Xu Jiaxin Shan Arvind Krishnamurthy Baris Kasikci and Liguang Xie. 2025. PolyServe: Efficient Multi-SLO Serving at Scale. arXiv:2507.17769 [cs.DC] https:\/\/arxiv.org\/abs\/2507.17769"}],"event":{"name":"ASPLOS '26: 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"deposited":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T14:08:09Z","timestamp":1773583689000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3779212.3790206"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":22,"alternative-id":["10.1145\/3779212.3790206","10.1145\/3779212"],"URL":"https:\/\/doi.org\/10.1145\/3779212.3790206","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}