{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T07:21:48Z","timestamp":1772695308194,"version":"3.50.1"},"reference-count":66,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024YFB4505703"],"award-info":[{"award-number":["2024YFB4505703"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62232011,62302302"],"award-info":[{"award-number":["62232011,62302302"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai Municipality","doi-asserted-by":"publisher","award":["25ZR1402241"],"award-info":[{"award-number":["25ZR1402241"]}],"id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408492","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-14","source":"Crossref","is-referenced-by-count":0,"title":["ELORA: Efficient LoRA and KV Cache Management for Multi-LoRA LLM Serving"],"prefix":"10.1109","author":[{"given":"Jiuchen","family":"Shi","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yixiao","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yizhou","family":"Shan","sequence":"additional","affiliation":[{"name":"Huawei Cloud"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kaihua","family":"Fu","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"117","article-title":"Taming throughput-latency tradeoff in llm inference with sarathi-serve","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal","year":"2024"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.678"},{"key":"ref3","volume-title":"Instruct-tune llama on consumer hardware using alpaca-lora","year":"2023"},{"key":"ref4","volume-title":"Introducing apple\u2019s on-device and server foundation models","year":"2025"},{"key":"ref5","article-title":"The costly dilemma: generalization, evaluation and cost-optimal deployment of large language models","author":"Aryan","year":"2023","journal-title":"arXiv preprint"},{"key":"ref6","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1459"},{"key":"ref8","article-title":"Kairos: Low-latency multi-agent serving with shared 11 ms and excessive loads in the public cloud","author":"Chen","year":"2025","journal-title":"arXiv preprint"},{"key":"ref9","first-page":"1","article-title":"Punica: Multi-tenant lora serving","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Chen","year":"2024"},{"key":"ref10","article-title":"Chatbot arena: An open platform for evaluating llms by human preference","author":"Chiang","year":"2024","journal-title":"arXiv preprint"},{"issue":"240","key":"ref11","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref12","volume-title":"copilot","year":"2022"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0441"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s11023-020-09548-1"},{"key":"ref15","first-page":"111","article-title":"Cost-efficient large language model serving for multi-turn conversations with cachedattention","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Gao","year":"2024"},{"key":"ref16","first-page":"325","article-title":"Prompt cache: Modular attention reuse for low-latency inference","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Gim","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716011"},{"key":"ref18","volume-title":"Bard","year":"2023"},{"key":"ref19","volume-title":"Why can\u2019t i use multi-lora adapter and radix attention together?","author":"Ha"},{"key":"ref20","article-title":"Lora: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv preprint"},{"key":"ref21","article-title":"Lorahub: Efficient cross-task generalization via dynamic lora composition","author":"Huang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref22","article-title":"Chameleon: Adaptive caching and scheduling for many-adapter 11m inference environments","author":"Iliakopoulou","year":"2024","journal-title":"arXiv preprint"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.17"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/1816038.1815971"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref26","first-page":"155","article-title":"InfiniGen: Efficient generative inference of large language models with dynamic KV cache management","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lee"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00572"},{"key":"ref29","article-title":"Toppings: CPU-assisted, rank-aware adapter serving for lm inference","volume-title":"Proc. USENIX ATC","author":"Li","year":"2025"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"ref31","article-title":"Personal 11 m agents: Insights and survey about the capability, efficiency and security","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref32","first-page":"929","article-title":"Parrot: Efficient serving of LLM-based applications with semantic variable","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lin"},{"key":"ref33","article-title":"Dora: Weight-decomposed low-rank adaptation","volume-title":"Forty-first International Conference on Machine Learning","author":"Liu"},{"key":"ref34","volume-title":"Llama3","year":"2024"},{"key":"ref35","article-title":"Arena learning: Build data flywheel for 11 ms post-training via simulated chatbot arena","author":"Luo","year":"2024","journal-title":"arXiv preprint"},{"key":"ref36","volume-title":"A tensorrt toolbox for optimized large language model inference"},{"key":"ref37","volume-title":"Chatgpt","year":"2020"},{"key":"ref38","volume-title":"torch.stream - pytorch 2.0.1 documentation","year":"2023"},{"key":"ref39","first-page":"155170","article-title":"Mooncake: Trading more storage for less computation-a kvcache-centric architecture for serving 11 m chatbot","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin","year":"2025"},{"key":"ref40","first-page":"205","article-title":"Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider","volume-title":"2020 USENIX annual technical conference (USENIX ATC 20)","author":"Shahrad","year":"2020"},{"key":"ref41","article-title":"Fast transformer decoding: One write-head is all you need","author":"Shazeer","year":"2019","journal-title":"arXiv preprint"},{"key":"ref42","first-page":"296","article-title":"SLoRA: Scalable serving of thousands of lora adapters","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Sheng","year":"2024"},{"key":"ref43","first-page":"31094","article-title":"FlexGen: High-throughput generative inference of large language models with a single GPU","volume-title":"Proceedings of the 40th International Conference on Machine Learning, ser. Proceedings of Machine Learning Research","volume":"202","author":"Sheng"},{"key":"ref44","first-page":"1149","article-title":"HALP: Heuristic aided learned preference eviction policy for YouTube content delivery network","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Song"},{"key":"ref45","article-title":"Understanding LSTM-a tutorial into long short-term memory recurrent neural networks","author":"Staudemeyer","year":"2019","journal-title":"arXiv preprint"},{"key":"ref46","article-title":"Llama 2: Open foundation and fine-tuned chat models","volume":"abs\/2307.09288","author":"Touvron","year":"2023","journal-title":"CoRR"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/2948072"},{"key":"ref48","volume-title":"vLLM: A high-throughput and memory-efficient inference and serving engine for 11 ms"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0088"},{"key":"ref50","article-title":"LoRA-Pro: Are low-rank adapters properly optimized?","volume-title":"The Thirteenth International Conference on Learning Representations (ICLR)","author":"Wang","year":"2025"},{"key":"ref51","volume-title":"Trie","year":"2023"},{"key":"ref52","first-page":"911","article-title":"dLoRA: Dynamically orchestrating requests and adapters for lorallm serving","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Wu","year":"2024"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"ref54","first-page":"521","article-title":"Orca: A distributed serving system for transformer-based generative models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu","year":"2022"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696086"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.148"},{"key":"ref57","article-title":"Google\u2019s neural machine translation system: Bridging the gap between human and machine translation","volume":"11","author":"Zhang","year":"2016","journal-title":"arXiv preprint"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3725338"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-short.107"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483580"},{"key":"ref61","volume-title":"Release v0.4.8","author":"Zhang"},{"key":"ref62","article-title":"LoRA Land: 310 fine-tuned 11 ms that rival gpt-4, a technical report","author":"Zhao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2020"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2000"},{"key":"ref65","first-page":"193","article-title":"Distserve: Disaggregating prefill and decoding for goodputoptimized large language model serving","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong","year":"2024"},{"key":"ref66","article-title":"Multilingual machine translation with large language models: Empirical results and analysis","author":"Zhu","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408492.pdf?arnumber=11408492","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:50:31Z","timestamp":1772693431000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408492\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":66,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408492","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}