{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T16:30:02Z","timestamp":1759336202278,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764823","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"446-461","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Jenga: Effective Memory Management for Serving LLM with Heterogeneity"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9045-9269","authenticated-orcid":false,"given":"Chen","family":"Zhang","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, Beijing, China"},{"name":"UC Berkeley, Berkeley, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3964-4079","authenticated-orcid":false,"given":"Kuntai","family":"Du","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, Illinois, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0825-6178","authenticated-orcid":false,"given":"Shu","family":"Liu","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8870-4892","authenticated-orcid":false,"given":"Woosuk","family":"Kwon","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4427-0036","authenticated-orcid":false,"given":"Xiangxi","family":"Mo","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9290-8978","authenticated-orcid":false,"given":"Yufeng","family":"Wang","sequence":"additional","affiliation":[{"name":"Independent Researcher, San Mateo, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4968-1027","authenticated-orcid":false,"given":"Xiaoxuan","family":"Liu","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1955-3743","authenticated-orcid":false,"given":"Kaichao","family":"You","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1534-9106","authenticated-orcid":false,"given":"Zhuohan","family":"Li","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5412-9120","authenticated-orcid":false,"given":"Mingsheng","family":"Long","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7656-6428","authenticated-orcid":false,"given":"Jidong","family":"Zhai","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2921-956X","authenticated-orcid":false,"given":"Joseph","family":"Gonzalez","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5373-0088","authenticated-orcid":false,"given":"Ion","family":"Stoica","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, California, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, et al. 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_1_2_1","first-page":"114","article-title":"Keyformer: Kv cache reduction through key tokens selection for efficient generative inference","volume":"6","author":"Adnan Muhammad","year":"2024","unstructured":"Muhammad Adnan, Akhil Arunkumar, Gaurav Jain, Prashant Nair, Ilya Soloveychik, and Purushotham Kamath. 2024. Keyformer: Kv cache reduction through key tokens selection for efficient generative inference. Proceedings of Machine Learning and Systems 6 (2024), 114\u2013127.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_3_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming {Throughput-Latency} Tradeoff in {LLM} Inference with {Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 117\u2013134."},{"key":"e_1_3_2_1_4_1","unstructured":"Mistral AI. 2024. Introducing the world's best edge models. https:\/\/huggingface.co\/mistralai\/Ministral-8B-Instruct-2410 Accessed: 2024-12-08."},{"key":"e_1_3_2_1_5_1","volume-title":"Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150 (2020)."},{"key":"e_1_3_2_1_7_1","unstructured":"Jeff Bonwick et al. 1994. The slab allocator: An object-caching kernel memory allocator.. In USENIX summer Vol. 16. Boston MA USA."},{"key":"e_1_3_2_1_8_1","volume-title":"Pyramidkv: Dynamic kv cache compression based on pyramidal information funneling. arXiv preprint arXiv:2406.02069","author":"Cai Zefan","year":"2024","unstructured":"Zefan Cai, Yichi Zhang, Bofei Gao, Yuliang Liu, Tianyu Liu, Keming Lu, Wayne Xiong, Yue Dong, Baobao Chang, Junjie Hu, et al. 2024. Pyramidkv: Dynamic kv cache compression based on pyramidal information funneling. arXiv preprint arXiv:2406.02069 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Character.ai. 2024. Optimizing AI Inference at Character.AI. https:\/\/research.character.ai\/optimizing-inference\/?ref=blog.character.ai Accessed: 2024-12-10."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185\u201324198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al. 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185\u201324198."},{"key":"e_1_3_2_1_11_1","volume-title":"Nvlm: Open frontier-class multimodal llms. arXiv preprint arXiv:2409.11402","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Nayeon Lee, Boxin Wang, Zhuolin Yang, Zihan Liu, Jon Barker, Tuomas Rintamaki, Mohammad Shoeybi, Bryan Catanzaro, and Wei Ping. 2024. Nvlm: Open frontier-class multimodal llms. arXiv preprint arXiv:2409.11402 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)."},{"key":"e_1_3_2_1_13_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems 35 (2022), 16344\u201316359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_14_1","volume-title":"Shih-Yang Liu, Matthijs Van Keirsbilck, Min-Hung Chen, Yoshi Suhara, et al.","author":"Dong Xin","year":"2024","unstructured":"Xin Dong, Yonggan Fu, Shizhe Diao, Wonmin Byeon, Zijia Chen, Ameya Sunil Mahabaleshwarkar, Shih-Yang Liu, Matthijs Van Keirsbilck, Min-Hung Chen, Yoshi Suhara, et al. 2024. Hymba: A Hybridhead Architecture for Small Language Models. arXiv preprint arXiv:2411.13676 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"XGrammar: Flexible and Efficient Structured Generation Engine for Large Language Models. arXiv preprint arXiv:2411.15100","author":"Dong Yixin","year":"2024","unstructured":"Yixin Dong, Charlie F Ruan, Yaxing Cai, Ruihang Lai, Ziyi Xu, Yilong Zhao, and Tianqi Chen. 2024. XGrammar: Flexible and Efficient Structured Generation Engine for Large Language Models. arXiv preprint arXiv:2411.15100 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"MuxServe: Flexible Multiplexing for Efficient Multiple LLM Serving. arXiv preprint arXiv:2404.02015","author":"Duan Jiangfei","year":"2024","unstructured":"Jiangfei Duan, Runyu Lu, Haojie Duanmu, Xiuhong Li, Xingcheng Zhang, Dahua Lin, Ion Stoica, and Hao Zhang. 2024. MuxServe: Flexible Multiplexing for Efficient Multiple LLM Serving. arXiv preprint arXiv:2404.02015 (2024)."},{"key":"e_1_3_2_1_17_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. {Cost-Efficient} Large Language Model Serving for Multi-turn Conversations with {CachedAttention}. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 111\u2013126."},{"key":"e_1_3_2_1_19_1","volume-title":"Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:2312.00752","author":"Gu Albert","year":"2023","unstructured":"Albert Gu and Tri Dao. 2023. Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:2312.00752 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Guo Cong","year":"2024","unstructured":"Cong Guo, Rui Zhang, Jiale Xu, Jingwen Leng, Zihan Liu, Ziyu Huang, Minyi Guo, Hao Wu, Shouren Zhao, Junping Zhao, et al. 2024. GMLake: Efficient and Transparent GPU Memory Defragmentation for Large-scale DNN Training with Virtual Memory Stitching. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2. 450\u2013466."},{"key":"e_1_3_2_1_21_1","volume-title":"Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, et al.","author":"Holmes Connor","year":"2024","unstructured":"Connor Holmes, Masahiro Tanaka, Michael Wyatt, Ammar Ahmad Awan, Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, et al. 2024. Deepspeed-fastgen: High-throughput text generation for llms via mii and deepspeed-inference. arXiv preprint arXiv:2401.08671 (2024)."},{"key":"e_1_3_2_1_22_1","unstructured":"Huggingface. 2024. arxiv-march-2023. https:\/\/huggingface.co\/datasets\/liyucheng\/arxiv-march-2023 Accessed: 2024-12-10."},{"key":"e_1_3_2_1_23_1","unstructured":"huggingface. 2024. Text Generation Inference. https:\/\/github.com\/huggingface\/text-generation-inference Accessed: 2024-12-10."},{"key":"e_1_3_2_1_24_1","volume-title":"International conference on machine learning. PMLR, 5156\u20135165","author":"Katharopoulos Angelos","year":"2020","unstructured":"Angelos Katharopoulos, Apoorv Vyas, Nikolaos Pappas, and Fran\u00e7ois Fleuret. 2020. Transformers are rnns: Fast autoregressive transformers with linear attention. In International conference on machine learning. PMLR, 5156\u20135165."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles. 611\u2013626","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient memory management for large language model serving with pagedattention. In Proceedings of the 29th Symposium on Operating Systems Principles. 611\u2013626."},{"key":"e_1_3_2_1_26_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning. PMLR, 19274\u201319286."},{"key":"e_1_3_2_1_27_1","volume-title":"Llavaonevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, et al. 2024. Llavaonevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Jamba: A hybrid transformer-mamba language model. arXiv preprint arXiv:2403.19887","author":"Lieber Opher","year":"2024","unstructured":"Opher Lieber, Barak Lenz, Hofit Bata, Gal Cohen, Jhonathan Osin, Itay Dalmedigos, Erez Safahi, Shaked Meirom, Yonatan Belinkov, Shai Shalev-Shwartz, et al. 2024. Jamba: A hybrid transformer-mamba language model. arXiv preprint arXiv:2403.19887 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Parrot: Efficient Serving of LLM-based Applications with Semantic Variable. arXiv preprint arXiv:2405.19888","author":"Lin Chaofan","year":"2024","unstructured":"Chaofan Lin, Zhenhua Han, Chengruidong Zhang, Yuqing Yang, Fan Yang, Chen Chen, and Lili Qiu. 2024. Parrot: Efficient Serving of LLM-based Applications with Semantic Variable. arXiv preprint arXiv:2405.19888 (2024)."},{"key":"e_1_3_2_1_30_1","unstructured":"Aixin Liu Bei Feng Bin Wang Bingxuan Wang Bo Liu Chenggang Zhao Chengqi Dengr Chong Ruan Damai Dai Daya Guo et al. 2024. Deepseek-v2: A strong economical and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems 36","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Optimizing Speculative Decoding for Serving Large Language Models Using Goodput. arXiv preprint arXiv:2406.14066","author":"Liu Xiaoxuan","year":"2024","unstructured":"Xiaoxuan Liu, Cade Daniel, Langxiang Hu, Woosuk Kwon, Zhuohan Li, Xiangxi Mo, Alvin Cheung, Zhijie Deng, Ion Stoica, and Hao Zhang. 2024. Optimizing Speculative Decoding for Serving Large Language Models Using Goodput. arXiv preprint arXiv:2406.14066 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the ACM SIGCOMM 2024 Conference. 38\u201356","author":"Liu Yuhan","year":"2024","unstructured":"Yuhan Liu, Hanchen Li, Yihua Cheng, Siddhant Ray, Yuyang Huang, Qizheng Zhang, Kuntai Du, Jiayi Yao, Shan Lu, Ganesh Ananthanarayanan, et al. 2024. Cachegen: Kv cache compression and streaming for fast large language model serving. In Proceedings of the ACM SIGCOMM 2024 Conference. 38\u201356."},{"key":"e_1_3_2_1_34_1","unstructured":"Meta. [n. d.]. The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation. https:\/\/ai.meta.com\/blog\/llama-4-multimodal-intelligence\/. [Accessed 2025-04-17]."},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Machine Learning. PMLR, 26670\u201326698","author":"Orvieto Antonio","year":"2023","unstructured":"Antonio Orvieto, Samuel L Smith, Albert Gu, Anushan Fernando, Caglar Gulcehre, Razvan Pascanu, and Soham De. 2023. Resurrecting recurrent neural networks for long sequences. In International Conference on Machine Learning. PMLR, 26670\u201326698."},{"key":"e_1_3_2_1_36_1","volume-title":"Marconi: Prefix Caching for the Era of Hybrid LLMs. arXiv preprint arXiv:2411.19379","author":"Pan Rui","year":"2024","unstructured":"Rui Pan, Zhuang Wang, Zhen Jia, Can Karakus, Luca Zancato, Tri Dao, Ravi Netravali, and Yida Wang. 2024. Marconi: Prefix Caching for the Era of Hybrid LLMs. arXiv preprint arXiv:2411.19379 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Rwkv: Reinventing rnns for the transformer era. arXiv preprint arXiv:2305.13048","author":"Peng Bo","year":"2023","unstructured":"Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Stella Biderman, Huanqi Cao, Xin Cheng, Michael Chung, Matteo Grella, et al. 2023. Rwkv: Reinventing rnns for the transformer era. arXiv preprint arXiv:2305.13048 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"vAttention: Dynamic Memory Management for Serving LLMs without PagedAttention. arXiv preprint arXiv:2405.04437","author":"Prabhu Ramya","year":"2024","unstructured":"Ramya Prabhu, Ajay Nayak, Jayashree Mohan, Ramachandran Ramjee, and Ashish Panwar. 2024. vAttention: Dynamic Memory Management for Serving LLMs without PagedAttention. arXiv preprint arXiv:2405.04437 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Kimi's kvcachecentric architecture for llm serving. arXiv preprint arXiv:2407.00079","author":"Qin Ruoyu","year":"2024","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu Mooncake. 2024. Kimi's kvcachecentric architecture for llm serving. arXiv preprint arXiv:2407.00079 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Flashattention-3: Fast and accurate attention with asynchrony and low-precision. arXiv preprint arXiv:2407.08608","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. Flashattention-3: Fast and accurate attention with asynchrony and low-precision. arXiv preprint arXiv:2407.08608 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. 2019. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150 (2019)."},{"key":"e_1_3_2_1_42_1","first-page":"296","article-title":"SLoRA: Scalable Serving of Thousands of LoRA Adapters","volume":"6","author":"Sheng Ying","year":"2024","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Coleman Hooper, Nicholas Lee, Shuo Yang, Christopher Chou, Banghua Zhu, Lianmin Zheng, Kurt Keutzer, et al. 2024. SLoRA: Scalable Serving of Thousands of LoRA Adapters. Proceedings of Machine Learning and Systems 6 (2024), 296\u2013311.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_43_1","volume-title":"Michael Tschannen, Daniel Keysers, Xiao Wang, Yonatan Bitton, Alexey Gritsenko, Matthias Minderer, Anthony Sherbondy, Shangbang Long, et al.","author":"Steiner Andreas","year":"2024","unstructured":"Andreas Steiner, Andr\u00e9 Susano Pinto, Michael Tschannen, Daniel Keysers, Xiao Wang, Yonatan Bitton, Alexey Gritsenko, Matthias Minderer, Anthony Sherbondy, Shangbang Long, et al. 2024. PaliGemma 2: A Family of Versatile VLMs for Transfer. arXiv preprint arXiv:2412.03555 (2024)."},{"key":"e_1_3_2_1_44_1","unstructured":"Gemma Team Aishwarya Kamath Johan Ferret Shreya Pathak Nino Vieillard Ramona Merhej Sarah Perrin Tatiana Matejovicova Alexandre Ram\u00e9 Morgane Rivi\u00e8re et al. 2025. Gemma 3 technical report. arXiv preprint arXiv:2503.19786 (2025)."},{"key":"e_1_3_2_1_45_1","volume-title":"Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al. 2024. Gemma 2: Improving open language models at a practical size. arXiv e-prints (2024), arXiv\u20132408."},{"key":"e_1_3_2_1_46_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_47_1","unstructured":"vLLM Team. [n. d.]. vLLM v0.6.0: 2.7x Throughput Improvement and 5x Latency Reduction \u2014 blog.vllm.ai. https:\/\/blog.vllm.ai\/2024\/09\/05\/perf-update.html. [Accessed 10-12-2024]."},{"key":"e_1_3_2_1_48_1","unstructured":"Bingning Wang Haizhou Zhao Huozhi Zhou Liang Song Mingyu Xu Wei Cheng Xiangrong Zeng Yupeng Zhang Yuqi Huo Zecheng Wang et al. 2025. Baichuan-m1: Pushing the medical capability of large language models. arXiv preprint arXiv:2502.12671 (2025)."},{"key":"e_1_3_2_1_49_1","unstructured":"Wikipedia. 2025. Buddy memory allocation. https:\/\/en.wikipedia.org\/wiki\/Buddy_memory_allocation Accessed: 2025-04-17."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles. 640\u2013654","author":"Wu Bingyang","year":"2024","unstructured":"Bingyang Wu, Shengyu Liu, Yinmin Zhong, Peng Sun, Xuanzhe Liu, and Xin Jin. 2024. Loongserve: Efficiently serving long-context large language models with elastic sequence parallelism. In Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles. 640\u2013654."},{"key":"e_1_3_2_1_51_1","volume-title":"Efficient streaming language models with attention sinks. arXiv preprint arXiv:2309.17453","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Yuandong Tian, Beidi Chen, Song Han, and Mike Lewis. 2023. Efficient streaming language models with attention sinks. arXiv preprint arXiv:2309.17453 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"PyramidInfer: Pyramid KV Cache Compression for High-throughput LLM Inference. arXiv preprint arXiv:2405.12532","author":"Yang Dongjie","year":"2024","unstructured":"Dongjie Yang, XiaoDong Han, Yan Gao, Yao Hu, Shilin Zhang, and Hai Zhao. 2024. PyramidInfer: Pyramid KV Cache Compression for High-throughput LLM Inference. arXiv preprint arXiv:2405.12532 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"CacheBlend: Fast Large Language Model Serving with Cached Knowledge Fusion. arXiv preprint arXiv:2405.16444","author":"Yao Jiayi","year":"2024","unstructured":"Jiayi Yao, Hanchen Li, Yuhan Liu, Siddhant Ray, Yihua Cheng, Qizheng Zhang, Kuntai Du, Shan Lu, and Junchen Jiang. 2024. CacheBlend: Fast Large Language Model Serving with Cached Knowledge Fusion. arXiv preprint arXiv:2405.16444 (2024)."},{"key":"e_1_3_2_1_54_1","unstructured":"Zihao Ye Lequn Chen Ruihang Lai Yilong Zhao Size Zheng Junru Shao Bohan Hou Hongyi Jin Yifei Zuo Liangsheng Yin Tianqi Chen and Luis Ceze. 2024. Accelerating Self-Attentions for LLM Serving with FlashInfer. https:\/\/flashinfer.ai\/2024\/02\/02\/introduce-flashinfer.html"},{"key":"e_1_3_2_1_55_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_2_1_56_1","volume-title":"Mmmu-pro: A more robust multi-discipline multimodal understanding benchmark. arXiv preprint arXiv:2409.02813","author":"Yue Xiang","year":"2024","unstructured":"Xiang Yue, Tianyu Zheng, Yuansheng Ni, Yubo Wang, Kai Zhang, Shengbang Tong, Yuxuan Sun, Botao Yu, Ge Zhang, Huan Sun, et al. 2024. Mmmu-pro: A more robust multi-discipline multimodal understanding benchmark. arXiv preprint arXiv:2409.02813 (2024)."},{"key":"e_1_3_2_1_57_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al.","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. 2023. Efficiently programming large language models using sglang. arXiv e-prints (2023), arXiv\u20132312."},{"key":"e_1_3_2_1_58_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. {DistServe}: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 193\u2013210."}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:55:19Z","timestamp":1759323319000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764823"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":58,"alternative-id":["10.1145\/3731569.3764823","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764823","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}