{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T05:10:45Z","timestamp":1759900245061,"version":"build-2065373602"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T00:00:00Z","timestamp":1756771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T00:00:00Z","timestamp":1756771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92267105"],"award-info":[{"award-number":["92267105"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,9,2]]},"DOI":"10.1109\/cluster59342.2025.11186463","type":"proceedings-article","created":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T17:35:09Z","timestamp":1759858509000},"page":"1-13","source":"Crossref","is-referenced-by-count":0,"title":["Rock: Serving Multimodal Models in Cloud with Heterogeneous-Aware Resource Orchestration for Thousands of LoRA Adapters"],"prefix":"10.1109","author":[{"given":"Shuaipeng","family":"Wu","sequence":"first","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology,Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanying","family":"Lin","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology,Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shijie","family":"Peng","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology,Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenyan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology,Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chong","family":"Ma","sequence":"additional","affiliation":[{"name":"Alibaba Group Inc,AIOS Team"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Min","family":"Shen","sequence":"additional","affiliation":[{"name":"Alibaba Group Inc,AIOS Team"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Le","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group Inc,AIOS Team"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengzhong","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Macau"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kejiang","family":"Ye","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology,Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"journal-title":"Lora: Low-rank adaptation of large language models","year":"2021","author":"Hu","key":"ref1"},{"key":"ref2","article-title":"An image is worth one word: Personalizing text-to-image generation using textual inversion","author":"Gal","year":"2022","journal-title":"arXiv preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.02155"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01202"},{"journal-title":"Multi-lora composition for image generation","year":"2024","author":"Zhong","key":"ref5"},{"journal-title":"Peft: State-of-theart parameter-efficient fine-tuning methods","year":"2022","author":"Mangrulkar","key":"ref6"},{"journal-title":"Train small, infer large: Memory-efficient lora training for large language models","year":"2025","author":"Zhang","key":"ref7"},{"article-title":"Qlora: efficient finetuning of quantized llms","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Dettmers","key":"ref8"},{"journal-title":"Punica: Multi-tenant lora serving","year":"2023","author":"Chen","key":"ref9"},{"key":"ref10","article-title":"Slora: Serving thousands of concurrent lora adapters","author":"Sheng","year":"2023","journal-title":"arXiv preprint"},{"key":"ref11","first-page":"911","article-title":"dLoRA: Dynamically orchestrating requests and adapters for LoRA LLM serving","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Wu"},{"journal-title":"Caraserve: Cpu-assisted and rank-aware lora serving for generative 11 m inference","year":"2024","author":"Li","key":"ref12"},{"key":"ref13","first-page":"595610","article-title":"Gandiva: Introspective cluster scheduling for deep learning","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao"},{"key":"ref14","first-page":"481","article-title":"Heterogeneity-aware cluster scheduling policies for deep learning workloads","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Narayanan","year":"2020"},{"journal-title":"Modserve: Scalable and resource-efficient large multimodal model serving","year":"2025","author":"Qiu","key":"ref15"},{"journal-title":"Shepherd: Serving dnns in the wild","author":"Zhang","key":"ref16"},{"journal-title":"Alpaserve: Statistical multiplexing with model parallelism for deep learning serving","year":"2023","author":"Li","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717472"},{"key":"ref19","first-page":"851","article-title":"Envpipe: Performance-preserving dnn training framework for saving energy","volume-title":"2023 USENIX Annual Technical Conference, USENIX ATC 2023","author":"Choi"},{"key":"ref20","first-page":"521","article-title":"Orca: A distributed serving system for transformer-based generative models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604813"},{"key":"ref22","first-page":"315","article-title":"On-demand container loading in aws lambda","volume-title":"2023 USENIX Annual Technical Conference, USENIX ATC 2023","author":"Brooker"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604813"},{"journal-title":"B4: Experience with a globally-deployed software defined wan: Acm sigcomm computer communication review: Vol 43, no 4","key":"ref24"},{"article-title":"Tetris: Memory-efficient serverless inference through tensor sharing","volume-title":"2022 USENIX Annual Technical Conference, USENIX ATC 2022","author":"Li","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563477"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref28","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i10.17103"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/s11222-017-9793-z"},{"journal-title":"Tensor moments of gaussian mixture models: Theory and applications","year":"2022","author":"Pereira","key":"ref31"},{"volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis","year":"2023","author":"Podell","key":"ref32"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS60910.2024.00010"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"journal-title":"QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving","year":"2024","author":"Lin","key":"ref36"},{"key":"ref37","first-page":"111","article-title":"Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention","volume-title":"Proceedings of the 2024 USENIX Annual Technical Conference, USENIX ATC 2024","author":"Gao","year":"2024"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696086"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717459"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696072"},{"key":"ref42","first-page":"739","article-title":"SuperServe: Fine-Grained Inference Serving for Unpredictable Workloads","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation, NSDI 2025","author":"Khare","year":"2025"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"journal-title":"Fp6\u20131lm: Efficiently serving large language models through fp6-centric algorithm-system co-design","year":"2024","author":"Xia","key":"ref44"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695948"},{"key":"ref46","first-page":"193","article-title":"DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong"},{"key":"ref47","first-page":"521","article-title":"Orca: A distributed serving system for Transformer-Based generative models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu"},{"journal-title":"Harnessing your dram and ssd for sustainable and accessible llm inference with mixed-precision and multi-level caching","year":"2024","author":"Peng","key":"ref48"},{"key":"ref49","first-page":"503518","article-title":"Segcache: A memory-efficient and scalable in-memory key-value cache for small objects","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Yang"},{"volume-title":"Aws inferentia","key":"ref50"},{"volume-title":"Ai model inference service: An overview","key":"ref51"}],"event":{"name":"2025 IEEE International Conference on Cluster Computing (CLUSTER)","start":{"date-parts":[[2025,9,2]]},"location":"United Kingdom","end":{"date-parts":[[2025,9,5]]}},"container-title":["2025 IEEE International Conference on Cluster Computing (CLUSTER)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11186399\/11186452\/11186463.pdf?arnumber=11186463","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T04:53:23Z","timestamp":1759899203000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11186463\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,2]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/cluster59342.2025.11186463","relation":{},"subject":[],"published":{"date-parts":[[2025,9,2]]}}}