{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:18:38Z","timestamp":1775229518329,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772230","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"776-789","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Symbiosis: Multi-Adapter Inference and Fine-Tuning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5814-3934","authenticated-orcid":false,"given":"Saransh","family":"Gupta","sequence":"first","affiliation":[{"name":"IBM Research, San Jose, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9506-0003","authenticated-orcid":false,"given":"Umesh","family":"Deshpande","sequence":"additional","affiliation":[{"name":"IBM Research, San Jose, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3033-3964","authenticated-orcid":false,"given":"Travis","family":"Janssen","sequence":"additional","affiliation":[{"name":"IBM Research, San Jose, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4468-3061","authenticated-orcid":false,"given":"Swaminathan","family":"Sundararaman","sequence":"additional","affiliation":[{"name":"IBM Research, San Jose, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. Best Practices for Generation with Cache. 
https:\/\/huggingface.co\/docs\/transformers\/en\/kv_cache#offloaded-cache. Accessed: 2024-09-23."},{"key":"e_1_3_2_1_2_1","volume-title":"Azure LLM inference Trace","year":"2024","unstructured":"2025. Azure LLM inference Trace 2024. https:\/\/github.com\/Azure\/AzurePublicDataset\/blob\/master\/AzureLLMInferenceDataset2024.md"},{"key":"e_1_3_2_1_3_1","unstructured":"2025. Fine-tune Meta Llama 3.2 using Amazon SageMaker. https:\/\/aws.amazon.com\/blogs\/machine-learning\/fine-tune-meta-llama-3-2-text-generation-models-for-generative-ai-inference-using-amazon-sagemaker-jumpstart"},{"key":"e_1_3_2_1_4_1","unstructured":"2025. FlexFlow: DNN Framework. \u201chttps:\/\/flexflow.ai\u201d"},{"key":"e_1_3_2_1_5_1","unstructured":"2025. LoRA fine-tuning Granite LLM. https:\/\/www.ibm.com\/think\/tutorials\/lora-fine-tuning-granite-llm"},{"key":"e_1_3_2_1_6_1","unstructured":"2025. LoRA Hyperparameters Guide. https:\/\/docs.unsloth.ai\/get-started\/fine-tuning-llms-guide\/lora-hyperparameters-guide"},{"key":"e_1_3_2_1_7_1","unstructured":"2025. NVIDIA H100 MIG Documentation. https:\/\/docs.nvidia.com\/launchpad\/ai\/h100-mig\/latest\/h100-mig-gpu.html. Accessed: 2025-01-14."},{"key":"e_1_3_2_1_8_1","volume-title":"SYMPHONY: Improving Memory Management for LLM Inference Workloads. arXiv:2412.16434 [cs.DC] https:\/\/arxiv.org\/abs\/2412.16434","author":"Agarwal Saurabh","year":"2024","unstructured":"Saurabh Agarwal, Anyong Mao, Aditya Akella, and Shivaram Venkataraman. 2024. SYMPHONY: Improving Memory Management for LLM Inference Workloads. arXiv:2412.16434 [cs.DC] https:\/\/arxiv.org\/abs\/2412.16434"},{"key":"e_1_3_2_1_9_1","volume-title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arXiv preprint arXiv:2308.16369","author":"Agrawal Aayush","year":"2023","unstructured":"Aayush Agrawal, Animesh Panwar, Jatin Mohan, Naman Kwatra, Bhargav S. Gulavani, and Ramachandran Ramjee. 2023. 
SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arXiv preprint arXiv:2308.16369 (2023). https:\/\/arxiv.org\/abs\/2308.16369"},{"key":"e_1_3_2_1_10_1","unstructured":"Syed Ahmed Christian Sarofeen Mike Ruberry Eddie Yan Natalia Gimelshein Michael Carilli Szymon Migacz Piotr Bialecki Paulius Micikevicius Dusan Stosic Dong Yang and Naoya Maruyama. 2024. What Every User Should Know About Mixed Precision Training in PyTorch. (2024). https:\/\/pytorch.org\/blog\/what-every-user-should-know-about-mixed-precision-training-in-pytorch\/"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_12_1","volume-title":"Punica: Multi-Tenant LoRA Serving. arXiv:2310.18547 [cs.DC] https:\/\/arxiv.org\/abs\/2310.18547","author":"Chen Lequn","year":"2023","unstructured":"Lequn Chen, Zihao Ye, Yongji Wu, Danyang Zhuo, Luis Ceze, and Arvind Krishnamurthy. 2023. Punica: Multi-Tenant LoRA Serving. arXiv:2310.18547 [cs.DC] https:\/\/arxiv.org\/abs\/2310.18547"},{"key":"e_1_3_2_1_13_1","volume-title":"Efficient and Economic Large Language Model Inference with Attention Offloading. arXiv preprint arXiv:2405.01814","author":"Chen Shaoyuan","year":"2024","unstructured":"Shaoyuan Chen, Yutong Lin, Mingxing Zhang, and Yongwei Wu. 2024. Efficient and Economic Large Language Model Inference with Attention Offloading. arXiv preprint arXiv:2405.01814 (2024)."},{"key":"e_1_3_2_1_14_1","unstructured":"LMDeploy Contributors. 2023. LMDeploy: A Toolkit for Compressing Deploying and Serving LLM. https:\/\/github.com\/InternLM\/lmdeploy."},{"key":"e_1_3_2_1_15_1","unstructured":"LoRAX Contributors. 2024. LoRAX: Multi-LoRA inference server that scales to 1000s of fine-tuned LLMs. https:\/\/github.com\/lorax\/lorax. Accessed: 2024-09-16."},{"key":"e_1_3_2_1_16_1","unstructured":"Yichao Fu Siqi Zhu Runlong Su Aurick Qiao Ion Stoica and Hao Zhang. 2024. Efficient LLM Scheduling by Learning to Rank. 
arXiv:2408.15792 [cs.LG] https:\/\/arxiv.org\/abs\/2408.15792"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5555\/3691992.3691999"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190541"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCOM.001.2000196"},{"key":"e_1_3_2_1_20_1","volume-title":"Chameleon: Adaptive Caching and Scheduling for Many-Adapter LLM Inference Environments. arXiv:2411.17741 [cs.DC] https:\/\/arxiv.org\/abs\/2411.17741","author":"Iliakopoulou Nikoleta","year":"2024","unstructured":"Nikoleta Iliakopoulou, Jovan Stojkovic, Chloe Alverti, Tianyin Xu, Hubertus Franke, and Josep Torrellas. 2024. Chameleon: Adaptive Caching and Scheduling for Many-Adapter LLM Inference Environments. arXiv:2411.17741 [cs.DC] https:\/\/arxiv.org\/abs\/2411.17741"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3721146.3721947"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_23_1","volume-title":"InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lee Wonbeom","year":"2024","unstructured":"Wonbeom Lee, Jungi Lee, Junghwan Seo, and Jaewoong Sim. 2024. InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 155\u2013172. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/lee"},{"key":"e_1_3_2_1_24_1","unstructured":"LeewayHertz. 2025. Parameter-efficient Fine-tuning (PEFT): Overview benefits techniques and model training. 
https:\/\/www.leewayhertz.com\/parameter-efficient-fine-tuning\/"},{"key":"e_1_3_2_1_25_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020), 9459\u20139474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_26_1","volume-title":"MIXLORA: Enhancing Large Language Models Fine-Tuning with LoRA-based Mixture of Experts. arXiv preprint arXiv:2404.15159v3","author":"Li Dengchun","year":"2024","unstructured":"Dengchun Li, Yingzi Ma, Naizheng Wang, Zhengmao Ye, Zhiyuan Cheng, Yinghao Tang, Yan Zhang, Lei Duan, Jie Zuo, Cal Yang, and Mingjie Tang. 2024. MIXLORA: Enhancing Large Language Models Fine-Tuning with LoRA-based Mixture of Experts. arXiv preprint arXiv:2404.15159v3 (2024)."},{"key":"e_1_3_2_1_27_1","unstructured":"Suyi Li Hanfeng Lu Tianyuan Wu Minchen Yu Qizhen Weng Xusheng Chen Yizhou Shan Binhang Yuan and Wei Wang. [n. d.]. CaraServe: CPU-Assisted and Rank-Aware LoRA Serving for Generative LLM Inference. https:\/\/arxiv.org\/html\/2401.11240v1"},{"key":"e_1_3_2_1_28_1","volume-title":"Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning. arXiv preprint arXiv:2303.15647","author":"Lialin Vladislav","year":"2024","unstructured":"Vladislav Lialin, Vijeta Deshpande, Xiaowei Yao, and Anna Rumshisky. 2024. Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning. 
arXiv preprint arXiv:2303.15647 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"arXiv preprint arXiv:2312.05386","author":"Liang Jiacheng","year":"2023","unstructured":"Jiacheng Liang, Ren Pang, Changjiang Li, and Ting Wang. 2023. Model Extraction Attacks Revisited. arXiv preprint arXiv:2312.05386 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning. arXiv preprint arXiv:2205.05638","author":"Liu Haokun","year":"2022","unstructured":"Haokun Liu, Derek Tam, Mohammed Muqeeth, Jay Mohta, Tenghao Huang, Mohit Bansal, and Colin Raffel. 2022. Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning. arXiv preprint arXiv:2205.05638 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods. https:\/\/github.com\/huggingface\/peft.","author":"Mangrulkar Sourab","year":"2022","unstructured":"Sourab Mangrulkar, Sylvain Gugger, Lysandre Debut, Younes Belkada, Sayak Paul, and Benjamin Bossan. 2022. PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods. https:\/\/github.com\/huggingface\/peft."},{"key":"e_1_3_2_1_32_1","volume-title":"RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation. arXiv preprint arXiv:2401.04679","author":"Nikdan Mahdi","year":"2024","unstructured":"Mahdi Nikdan, Soroush Tabesh, Elvir Crncevic, and Dan Alistarh. 2024. RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation. arXiv preprint arXiv:2401.04679 (2024). https:\/\/arxiv.org\/abs\/2401.04679"},{"key":"e_1_3_2_1_33_1","unstructured":"Gabriele Oliaro Xupeng Miao Xinhao Cheng Vineeth Kada Ruohan Gao Yingyi Huang Remi Delacourt April Yang Yingcheng Wang Mengdi Wu Colin Unger and Zhihao Jia. 2025. FlexLLM: A System for Co-Serving Large Language Model Inference and Parameter-Efficient Finetuning. 
arXiv:2402.18789 [cs.DC] https:\/\/arxiv.org\/abs\/2402.18789"},{"key":"e_1_3_2_1_34_1","volume-title":"InstInfer: In-Storage Attention Offloading for Cost-Effective Long-Context LLM Inference. arXiv preprint arXiv:2409.04992","author":"Pan Xiurui","year":"2024","unstructured":"Xiurui Pan, Endian Li, Qiao Li, Shengwen Liang, Yizhou Shan, Ke Zhou, Yingwei Luo, Xiaolin Wang, and Jie Zhang. 2024. InstInfer: In-Storage Attention Offloading for Cost-Effective Long-Context LLM Inference. arXiv preprint arXiv:2409.04992 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"Splitwise: Efficient generative LLM inference using phase splitting. In ISCA. https:\/\/www.microsoft.com\/en-us\/research\/publication\/splitwise-efficient-generative-llm-inference-using-phase-splitting\/","author":"Patel Pratyush","year":"2024","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, Aashaka Shah, \u00cd\u00f1igo Goiri, Saeed Maleki, and Ricardo Bianchini. 2024. Splitwise: Efficient generative LLM inference using phase splitting. In ISCA. https:\/\/www.microsoft.com\/en-us\/research\/publication\/splitwise-efficient-generative-llm-inference-using-phase-splitting\/"},{"key":"e_1_3_2_1_36_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation \u2014 A KVCache-centric Architecture for Serving LLM Chatbot. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). USENIX Association, Santa Clara, CA, 155\u2013170. 
https:\/\/www.usenix.org\/conference\/fast25\/presentation\/qin"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","unstructured":"Ying Sheng Shiyi Cao Dacheng Li Coleman Hooper Nicholas Lee Shuo Yang Christopher Chou Banghua Zhu Lianmin Zheng Kurt Keutzer Joseph E. Gonzalez and Ion Stoica. 2024. S-LoRA: Serving Thousands of Concurrent LoRA Adapters. https:\/\/doi.org\/10.48550\/arXiv.2311.03285 arXiv:2311.03285 [cs].","DOI":"10.48550\/arXiv.2311.03285"},{"key":"e_1_3_2_1_40_1","unstructured":"Ying Sheng Lianmin Zheng Binhang Yuan Zhuohan Li Max Ryabinin Daniel Y. Fu Zhiqiang Xie Beidi Chen Clark Barrett Joseph E. Gonzalez Percy Liang Christopher R\u00e9 Ion Stoica and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. https:\/\/arxiv.org\/abs\/2303.06865"},{"key":"e_1_3_2_1_41_1","volume-title":"25th USENIX Security Symposium (USENIX Security 16)","author":"Tram\u00e8r Florian","year":"2016","unstructured":"Florian Tram\u00e8r, Fan Zhang, Ari Juels, Michael K. Reiter, and Thomas Ristenpart. 2016. Stealing Machine Learning Models via Prediction APIs. In 25th USENIX Security Symposium (USENIX Security 16). USENIX Association, Austin, TX, 601\u2013618. 
https:\/\/www.usenix.org\/conference\/usenixsecurity16\/technical-sessions\/presentation\/tramer"},{"key":"e_1_3_2_1_42_1","volume-title":"HoneypotNet: Backdoor Attacks Against Model Extraction. arXiv preprint arXiv:2501.01090","author":"Wang Yixu","year":"2025","unstructured":"Yixu Wang, Tianle Gu, Yan Teng, Yingchun Wang, and Xingjun Ma. 2025. HoneypotNet: Backdoor Attacks Against Model Extraction. arXiv preprint arXiv:2501.01090 (2025)."},{"key":"e_1_3_2_1_43_1","volume-title":"Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush.","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. Association for Computational Linguistics, 38\u201345. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38\u201345","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38\u201345. 
https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_45_1","unstructured":"Bingyang Wu Yinmin Zhong Zili Zhang Shengyu Liu Fangyue Liu Yuanhang Sun Gang Huang Xuanzhe Liu and Xin Jin. 2024. Fast Distributed Inference Serving for Large Language Models. arXiv:2305.05920 [cs.LG] https:\/\/arxiv.org\/abs\/2305.05920"},{"key":"e_1_3_2_1_46_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Wu Bingyang","year":"2024","unstructured":"Bingyang Wu, Ruidong Zhu, Zili Zhang, Peng Sun, Xuanzhe Liu, and Xin Jin. 2024. dLoRA: Dynamically Orchestrating Requests and Adapters for LoRA LLM Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 911\u2013927. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/wu-bingyang"},{"key":"e_1_3_2_1_47_1","unstructured":"Guangxuan Xiao Jiaming Tang Jingwei Zuo Junxian Guo Shang Yang Haotian Tang Yao Fu and Song Han. 2024. DuoAttention: Efficient Long-Context LLM Inference with Retrieval and Streaming Heads. arXiv:2410.10819 [cs.CL] https:\/\/arxiv.org\/abs\/2410.10819"},{"key":"e_1_3_2_1_48_1","volume-title":"LayerKV: Optimizing Large Language Model Serving with Layer-wise KV Cache Management. arXiv preprint arXiv:2410.00428v3","author":"Xiong Yi","year":"2024","unstructured":"Yi Xiong, Hao Wu, Changxu Shao, Ziqing Wang, Rui Zhang, Yuhong Guo, Junping Zhao, Ke Zhang, and Zhenxuan Pan. 2024. LayerKV: Optimizing Large Language Model Serving with Layer-wise KV Cache Management. arXiv preprint arXiv:2410.00428v3 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"ASPEN: High-Throughput LoRA Fine-Tuning of Large Language Models with a Single GPU. arXiv preprint arXiv:2312.02515","author":"Ye Zhengmao","year":"2023","unstructured":"Zhengmao Ye, Dengchun Li, Jingqi Tian, Tingfeng Lan, Jie Zuo, Lei Duan, Hui Lu, Yexi Jiang, Jian Sha, Ke Zhang, and Mingjie Tang. 2023. 
ASPEN: High-Throughput LoRA Fine-Tuning of Large Language Models with a Single GPU. arXiv preprint arXiv:2312.02515 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521\u2013538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"ZeroMQ. 2024. Getting Started with ZeroMQ. https:\/\/zeromq.org\/get-started\/ Accessed: 2024-09-23.","DOI":"10.1007\/979-8-8688-0935-4_2"},{"key":"e_1_3_2_1_52_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=RkRrPp7GKO","author":"Zhang Zhenyu","year":"2023","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher Re, Clark Barrett, Zhangyang Wang, and Beidi Chen. 2023. H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=RkRrPp7GKO"},{"key":"e_1_3_2_1_53_1","unstructured":"Ye Zhengmao Li Dengchun Tian Jingqi Lan Tingfeng Liang Yanbo Jiang Yexi Zuo Jie Lu Hui Duan Lei and Tang Mingjie. 2023. m-LoRA: Efficient LLM Model Fine-tune and Inference via Multi-Lora Optimization. https:\/\/github.com\/TUDB-Labs\/mLoRA. *: these authors contributed equally to this work."},{"key":"e_1_3_2_1_54_1","unstructured":"Yinmin Zhong Shengyu Liu Junda Chen Jianbo Hu Yibo Zhu Xuanzhe Liu Xin Jin and Hao Zhang. 
2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. arXiv:2401.09670 [cs.DC] https:\/\/arxiv.org\/abs\/2401.09670"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772230","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:20:12Z","timestamp":1768321212000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772230"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":54,"alternative-id":["10.1145\/3772052.3772230","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772230","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}