{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T00:17:40Z","timestamp":1777421860944,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["NSF-2421782, NSF-2350425, NSF-2319988, NSF-2206522"],"award-info":[{"award-number":["NSF-2421782, NSF-2350425, NSF-2319988, NSF-2206522"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100004318","name":"Microsoft","doi-asserted-by":"publisher","award":["Microsoft Research Faculty Fellowship 8300751"],"award-info":[{"award-number":["Microsoft Research Faculty Fellowship 8300751"]}],"id":[{"id":"10.13039\/100004318","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772254","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"817-830","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["M\n                    <scp>od<\/scp>\n                    S\n                    <scp>erve<\/scp>\n                    : Modality- and Stage-Aware Resource Disaggregation for Scalable Multimodal Model Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8071-1130","authenticated-orcid":false,"given":"Haoran","family":"Qiu","sequence":"first","affiliation":[{"name":"Microsoft, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6149-9739","authenticated-orcid":false,"given":"Anish","family":"Biswas","sequence":"additional","affiliation":[{"name":"Microsoft, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5167-0918","authenticated-orcid":false,"given":"Zihan","family":"Zhao","sequence":"additional","affiliation":[{"name":"University of Virginia, Charlottesville, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5260-3203","authenticated-orcid":false,"given":"Jayashree","family":"Mohan","sequence":"additional","affiliation":[{"name":"Microsoft, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4649-9022","authenticated-orcid":false,"given":"Alind","family":"Khare","sequence":"additional","affiliation":[{"name":"Microsoft, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0371-5522","authenticated-orcid":false,"given":"Esha","family":"Choukse","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2591-4012","authenticated-orcid":false,"given":"\u00cd\u00f1igo","family":"Goiri","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7853-6854","authenticated-orcid":false,"given":"Zeyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Virginia, Charlottesville, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7548-6223","authenticated-orcid":false,"given":"Haiying","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Virginia, Charlottesville, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0102-8139","authenticated-orcid":false,"given":"Chetan","family":"Bansal","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0007-6040","authenticated-orcid":false,"given":"Ram","family":"Ramjee","sequence":"additional","affiliation":[{"name":"Microsoft, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9662-2661","authenticated-orcid":false,"given":"Rodrigo","family":"Fonseca","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/3691938.3691945"},{"key":"e_1_3_2_1_2_1","volume-title":"Medha: Efficiently Serving Multi-Million Context Length LLM Inference Requests Without Approximations. arXiv preprint arXiv:2409.17264","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Haoran Qiu, Junda Chen, \u00cd\u00f1igo Goiri, Chaojie Zhang, Rayyan Shahid, Ramachandran Ramjee, Alexey Tumanov, and Esha Choukse. 2024. Medha: Efficiently Serving Multi-Million Context Length LLM Inference Requests Without Approximations. arXiv preprint arXiv:2409.17264 (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. 2024 Conference on Neural Information Processing Systems (NeurIPS 2024) 35 (2022) 23716\u201323736."},{"key":"e_1_3_2_1_4_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv","author":"Alexey Dosovitskiy","year":"2010","unstructured":"Dosovitskiy Alexey. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv: 2010.11929 (2020)."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC).","author":"Aminabadi Reza Yazdani","year":"2022","unstructured":"Reza Yazdani Aminabadi, Samyam Rajbhandari, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Olatunji Ruwase, Shaden Smith, Minjia Zhang, Jeff Rasley, et al. 2022. DeepSpeed-Inference: Enabling efficient inference of transformer models at unprecedented scale. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC)."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). 1803","author":"Chen Jun","year":"2022","unstructured":"Jun Chen, Han Guo, Kai Yi, Boyang Li, and Mohamed Elhoseiny. 2022. VisualGPT: Data-efficient adaptation of pretrained language models for image captioning. In Proceedings of the 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2022). 18030\u201318040."},{"key":"e_1_3_2_1_7_1","unstructured":"Xiaokang Chen Zhiyu Wu Xingchao Liu Zizheng Pan Wen Liu Zhenda Xie Xingkai Yu and Chong Ruan. 2025. Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Zhe Chen Weiyun Wang Hao Tian Shenglong Ye Zhangwei Gao Erfei Cui Wenwen Tong Kongzhi Hu Jiapeng Luo Zheng Ma et al. 2024. How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites. arXiv preprint arXiv:2404.16821 (2024).","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al. 2024. InternVL: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2024). 24185\u201324198."},{"key":"e_1_3_2_1_10_1","volume-title":"Lewei and others","author":"Wu Zhe","year":"2024","unstructured":"Chen, Zhe and Wu, Jiannan and Wang, Wenhai and Su, Weijie and Chen, Guo and Xing, Sen and Zhong, Muyan and Zhang, Qinglong and Zhu, Xizhou and Lu, Lewei and others. 2024. HuggingFace Model: OpenGVLab\/InternVL2_5-26B. https:\/\/huggingface.co\/OpenGVLab\/InternVL2_5-26B."},{"key":"e_1_3_2_1_11_1","volume-title":"Kartikeya Upasani, and Mahesh Pasupuleti.","author":"Chi Jianfeng","year":"2024","unstructured":"Jianfeng Chi, Ujjwal Karn, Hongyuan Zhan, Eric Smith, Javier Rando, Yiming Zhang, Kate Plawiak, Zacharie Delpierre Coudert, Kartikeya Upasani, and Mahesh Pasupuleti. 2024. Llama Guard 3 Vision: Safeguarding Human-AI Image Understanding Conversations. arXiv preprint arXiv:2411.10414 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"NVLM: Open Frontier-Class Multimodal LLMs. arXiv:2409.11402 [cs.CL] https:\/\/arxiv.org\/abs\/2409.11402","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Nayeon Lee, Boxin Wang, Zhuolin Yang, Zihan Liu, Jon Barker, Tuomas Rintamaki, Mohammad Shoeybi, Bryan Catanzaro, and Wei Ping. 2024. NVLM: Open Frontier-Class Multimodal LLMs. arXiv:2409.11402 [cs.CL] https:\/\/arxiv.org\/abs\/2409.11402"},{"key":"e_1_3_2_1_13_1","unstructured":"Dai Wenliang and Lee Nayeon and Wang Boxin and Yang Zhuolin and Liu Zihan and Barker Jon and Rintamaki Tuomas and Shoeybi Mohammad and Catanzaro Bryan and Ping Wei. 2024. HuggingFace Model: nvidia\/NVLM-D-72B. https:\/\/huggingface.co\/nvidia\/NVLM-D-72B."},{"key":"e_1_3_2_1_14_1","volume-title":"HydraInfer: Hybrid Disaggregated Scheduling for Multimodal Large Language Model Serving. arXiv preprint arXiv:2505.12658","author":"Dong Xianzhe","year":"2025","unstructured":"Xianzhe Dong, Tongxuan Liu, Yuting Zeng, Liangyu Liu, Yang Liu, Siyu Wu, Yu Wu, Hailong Yang, Ke Zhang, and Jing Li. 2025. HydraInfer: Hybrid Disaggregated Scheduling for Multimodal Large Language Model Serving. arXiv preprint arXiv:2505.12658 (2025)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia.","author":"Duan Haodong","year":"2024","unstructured":"Haodong Duan, Junming Yang, Yuxuan Qiao, Xinyu Fang, Lin Chen, Yuan Liu, Xiaoyi Dong, Yuhang Zang, Pan Zhang, Jiaqi Wang, et al. 2024. VLMEvalKit: An open-source toolkit for evaluating large multi-modality models. https:\/\/huggingface.co\/spaces\/opencompass\/open_vlm_leaderboard. In Proceedings of the 32nd ACM International Conference on Multimedia."},{"key":"e_1_3_2_1_16_1","volume-title":"Azure Public Dataset: Azure LMM Inference Trace","year":"2025","unstructured":"GitHub. 2025. Azure Public Dataset: Azure LMM Inference Trace 2025. https:\/\/github.com\/Azure\/AzurePublicDataset\/tree\/master."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/383962.384010"},{"key":"e_1_3_2_1_18_1","volume-title":"The Deployment of End-to-End Audio Language Models Should Take into Account the Principle of Least Privilege. arXiv preprint arXiv:2503.16833","author":"He Luxi","year":"2025","unstructured":"Luxi He, Xiangyu Qi, Michel Liao, Inyoung Cheong, Prateek Mittal, Danqi Chen, and Peter Henderson. 2025. The Deployment of End-to-End Audio Language Models Should Take into Account the Principle of Least Privilege. arXiv preprint arXiv:2503.16833 (2025)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2022.3215718"},{"key":"e_1_3_2_1_20_1","unstructured":"Cunchen Hu Heyang Huang Junhao Hu Jiang Xu Xusheng Chen Tao Xie Chenxi Wang Sa Wang Yungang Bao Ninghui Sun et al. 2024. MemServe: Context caching for disaggregated LLM serving with elastic memory pool. arXiv preprint arXiv:2406.17565 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 2023 IEEE\/CVF International Conference on Computer Vision (ICCV","author":"Hu Yushi","year":"2023","unstructured":"Yushi Hu, Hang Hua, Zhengyuan Yang, Weijia Shi, Noah A Smith, and Jiebo Luo. 2023. PromptCap: Prompt-guided image captioning for VQA with GPT-3. In Proceedings of the 2023 IEEE\/CVF International Conference on Computer Vision (ICCV 2023). 2963\u20132975."},{"key":"e_1_3_2_1_22_1","volume-title":"Dynamic-LLaVA: Efficient Multimodal Large Language Models via Dynamic Vision-language Context Sparsification. arXiv preprint arXiv:2412.00876","author":"Huang Wenxuan","year":"2024","unstructured":"Wenxuan Huang, Zijie Zhai, Yunhang Shen, Shaoshen Cao, Fei Zhao, Xiangfeng Xu, Zheyu Ye, and Shaohui Lin. 2024. Dynamic-LLaVA: Efficient Multimodal Large Language Models via Dynamic Vision-language Context Sparsification. arXiv preprint arXiv:2412.00876 (2024)."},{"key":"e_1_3_2_1_23_1","unstructured":"HuggingFace. 2024. Audio-Text-to-Text Models. https:\/\/huggingface.co\/models?pipeline_tag=audio-text-to-text."},{"key":"e_1_3_2_1_24_1","unstructured":"HuggingFace. 2024. HuggingFace Transformers. https:\/\/huggingface.co\/docs\/transformers\/en\/index."},{"key":"e_1_3_2_1_25_1","unstructured":"HuggingFace. 2024. Image-Text-to-Text Models. https:\/\/huggingface.co\/models?pipeline_tag=image-text-to-text."},{"key":"e_1_3_2_1_26_1","volume-title":"Pod-attention: Unlocking full prefill-decode overlap for faster LLM inference. arXiv preprint arXiv:2410.18038","author":"Kamath Aditya K","year":"2024","unstructured":"Aditya K Kamath, Ramya Prabhu, Jayashree Mohan, Simon Peter, Ramachandran Ramjee, and Ashish Panwar. 2024. Pod-attention: Unlocking full prefill-decode overlap for faster LLM inference. arXiv preprint arXiv:2410.18038 (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_28_1","volume-title":"Christian Puhrsch, Daniel Haziza, Driss Guessous, et al.","author":"Lee Yejin","year":"2024","unstructured":"Yejin Lee, Anna Sun, Basil Hosmer, Bilge Acun, Can Balioglu, Changhan Wang, Charles David Hernandez, Christian Puhrsch, Daniel Haziza, Driss Guessous, et al. 2024. Characterizing and Efficiently Accelerating Multimodal Generation Model Inference. arXiv preprint arXiv:2410.00215 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_30_1","volume-title":"LLaVA-OneVision: Easy Visual Task Transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Yanwei Li, Ziwei Liu, and Chunyuan Li. 2024. LLaVA-OneVision: Easy Visual Task Transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_31_1","unstructured":"Bo Li Yuanhan Zhang Dong Guo Renrui Zhang Feng Li Hao Zhang Kaichen Zhang Peiyuan Zhang Yanwei Li Ziwei Liu and Chunyuan Li. 2024. LLaVA-OneVision: Easy Visual Task Transfer. arXiv:2408.03326 [cs.CV] https:\/\/arxiv.org\/abs\/2408.03326"},{"key":"e_1_3_2_1_32_1","unstructured":"Kevin Y Li Sachin Goyal Joao D Semedo and J Zico Kolter. 2024. Inference Optimal VLMs Need Only One Visual Token but Larger Models. arXiv preprint arXiv:2411.03312(2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Li Bo and Zhang Yuanhan and Guo Dong and Zhang Renrui and Li Feng and Zhang Hao and Zhang Kaichen and Li Yanwei and Liu Ziwei and Li Chunyuan. 2024. HuggingFace Model: lmms-lab\/llava-onevision-qwen2-72b-ov-sft. https:\/\/huggingface.co\/lmms-lab\/llava-onevision-qwen2-72b-ov-sft."},{"key":"e_1_3_2_1_34_1","unstructured":"Li Bo and Zhang Yuanhan and Guo Dong and Zhang Renrui and Li Feng and Zhang Hao and Zhang Kaichen and Li Yanwei and Liu Ziwei and Li Chunyuan. 2024. HuggingFace Model: lmms-lab\/llava-onevision-qwen2-7b-ov. https:\/\/huggingface.co\/lmms-lab\/llava-onevision-qwen2-7b-ov."},{"key":"e_1_3_2_1_35_1","volume-title":"Boosting Multimodal Large Language Models with Visual Tokens Withdrawal for Rapid Inference. arXiv preprint arXiv:2405.05803","author":"Lin Zhihang","year":"2024","unstructured":"Zhihang Lin, Mingbao Lin, Luxi Lin, and Rongrong Ji. 2024. Boosting Multimodal Large Language Models with Visual Tokens Withdrawal for Rapid Inference. arXiv preprint arXiv:2405.05803 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"European Conference on Computer Vision (ECCV).","author":"Liu Zuyan","year":"2025","unstructured":"Zuyan Liu, Benlin Liu, Jiahui Wang, Yuhao Dong, Guangyi Chen, Yongming Rao, Ranjay Krishna, and Jiwen Lu. 2025. Efficient inference of vision instruction-following models with elastic cache. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_37_1","unstructured":"Meta AI. 2024. HuggingFace Model: meta-llama\/Llama-3.2-11B-Vision-Instruct. https:\/\/huggingface.co\/meta-llama\/Llama-3.2-11B-Vision-Instruct."},{"key":"e_1_3_2_1_38_1","unstructured":"Meta AI. 2024. HuggingFace Model: meta-llama\/Llama-3.2-90B-Vision-Instruct. https:\/\/huggingface.co\/meta-llama\/Llama-3.2-90B-Vision-Instruct."},{"key":"e_1_3_2_1_39_1","unstructured":"Microsoft Azure. 2024. Azure VM NDm-A100-v4 sizes series. https:\/\/learn.microsoft.com\/en-us\/azure\/virtual-machines\/sizes\/gpu-accelerated\/ndma100v4-series."},{"key":"e_1_3_2_1_40_1","volume-title":"ClipCap: CLIP Prefix for Image Captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. ClipCap: CLIP Prefix for Image Captioning. arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_1_41_1","volume-title":"Inf-MLLM: Efficient Streaming Inference of Multimodal Large Language Models on a Single GPU. arXiv preprint arXiv:2409.09086","author":"Ning Zhenyu","year":"2024","unstructured":"Zhenyu Ning, Jieru Zhao, Qihao Jin, Wenchao Ding, and Minyi Guo. 2024. Inf-MLLM: Efficient Streaming Inference of Multimodal Large Language Models on a Single GPU. arXiv preprint arXiv:2409.09086 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"ScreenAgent: A Vision Language Model-Driven Computer Control Agent. arXiv preprint arXiv:2402.07945","author":"Niu Runliang","year":"2024","unstructured":"Runliang Niu, Jindong Li, Shiqi Wang, Yali Fu, Xiyu Hu, Xueyuan Leng, He Kong, Yi Chang, and Qi Wang. 2024. ScreenAgent: A Vision Language Model-Driven Computer Control Agent. arXiv preprint arXiv:2402.07945 (2024)."},{"key":"e_1_3_2_1_43_1","unstructured":"NVIDIA. 2024. NVIDIA TensoRT. https:\/\/github.com\/NVIDIA\/TensorRT."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.5555\/2643634.2643666"},{"key":"e_1_3_2_1_45_1","unstructured":"OpenAI. 2025. Computer-Using Agent: Introducing a universal interface for AI to interact with the digital world. https:\/\/openai.com\/index\/computer-using-agent."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 15th ACM Symposium on Cloud Computing (SoCC).","author":"Patke Archit","year":"2024","unstructured":"Archit Patke, Dhemath Reddy, Saurabh Jha, Haoran Qiu, Christian Pinto, Chandra Narayanaswami, Zbigniew Kalbarczyk, and Ravishankar Iyer. 2024. Queue Management for SLO-Oriented Large Language Model Serving. In Proceedings of the 15th ACM Symposium on Cloud Computing (SoCC)."},{"key":"e_1_3_2_1_48_1","volume-title":"Mooncake: A KVCache-centric Disaggregated Architecture for LLM Serving. arXiv preprint arXiv:2407.00079","author":"Qin Ruoyu","year":"2024","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2024. Mooncake: A KVCache-centric Disaggregated Architecture for LLM Serving. arXiv preprint arXiv:2407.00079 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"USENIX Annual Technical Conference (USENIX ATC","author":"Qiu Haoran","year":"2024","unstructured":"Haoran Qiu, Weichao Mao, Archit Patke, Shengkun Cui, Saurabh Jha, Chen Wang, Hubertus Franke, Zbigniew Kalbarczyk, Tamer Ba\u015far, and Ravishankar K. Iyer. 2024. Power-aware Deep Learning Model Serving with \u03bc-Serve. In USENIX Annual Technical Conference (USENIX ATC 2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"Efficient Interactive LLM Serving with Proxy Model-based Sequence Length Prediction. In The 5th International Workshop on Cloud Intelligence \/ AIOps at ASPLOS","author":"Qiu Haoran","year":"2024","unstructured":"Haoran Qiu, Weichao Mao, Archit Patke, Shengkun Cui, Saurabh Jha, Chen Wang, Hubertus Franke, Zbigniew T. Kalbarczyk, Tamer Ba\u015far, and Ravishankar K. Iyer. 2024. Efficient Interactive LLM Serving with Proxy Model-based Sequence Length Prediction. In The 5th International Workshop on Cloud Intelligence \/ AIOps at ASPLOS 2024."},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV).","author":"Schwenk Dustin","year":"2022","unstructured":"Dustin Schwenk, Apoorv Khandelwal, Christopher Clark, Kenneth Marino, and Roozbeh Mottaghi. 2022. A-OKVQA: A benchmark for visual question answering using world knowledge. In Proceedings of the European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR","author":"Shao Zhenwei","year":"2023","unstructured":"Zhenwei Shao, Zhou Yu, Meng Wang, and Jun Yu. 2023. Prompting large language models with answer heuristics for knowledge-based visual question answering. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2023)."},{"key":"e_1_3_2_1_53_1","unstructured":"Mohammad Shazeer et al. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Pipeline Parallelism. arXiv preprint arXiv:1909.08053 (2020). https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_2_1_54_1","unstructured":"Gursimran Singh Xinglu Wang Ivan Hu Timothy Yu Linzi Xing Wei Jiang Zhefeng Wang Xiaolong Bai Yi Li Ying Xiong et al. 2024. Efficiently serving large multimedia models using EPD Disaggregation. arXiv preprint arXiv:2501.05460 (2024)."},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS). 1266\u20131281","author":"Stojkovic Jovan","year":"2025","unstructured":"Jovan Stojkovic, Chaojie Zhang, \u00cd\u00f1igo Goiri, Esha Choukse, Haoran Qiu, Rodrigo Fonseca, Josep Torrellas, and Ricardo Bianchini. 2025. TAPAS: Thermal-and Power-Aware Scheduling for LLM Inference in Cloud Platforms. In Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS). 1266\u20131281."},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of the IEEE International Symposium on High Performance Computer Architecture (HPCA).","author":"Stojkovic Jovan","year":"2025","unstructured":"Jovan Stojkovic, Chaojie Zhang, \u00cd\u00f1igo Goiri, Josep Torrellas, and Esha Choukse. 2025. DynamoLLM: Designing LLM Inference Clusters for Performance and Energy Efficiency. In Proceedings of the IEEE International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_57_1","volume-title":"Llumnix: Dynamic Scheduling for Large Language Model Serving. arXiv preprint arXiv:2406.03243","author":"Sun Biao","year":"2024","unstructured":"Biao Sun, Ziming Huang, Hanyu Zhao, Wencong Xiao, Xinyi Zhang, Yong Li, and Wei Lin. 2024. Llumnix: Dynamic Scheduling for Large Language Model Serving. arXiv preprint arXiv:2406.03243 (2024)."},{"key":"e_1_3_2_1_58_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_1_59_1","unstructured":"vLLM. 2024. Distributed Inference and Serving. https:\/\/docs.vllm.ai\/en\/latest\/serving\/distributed_serving.html."},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI).","author":"Wang Cheng","year":"2018","unstructured":"Cheng Wang, Xusheng Chen, Weiwei Jia, Boxuan Li, Haoran Qiu, Shixiong Zhao, and Heming Cui. 2018. PLOVER: Fast, Multi-core Scalable Virtual Machine Fault-tolerance. In Proceedings of the 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI)."},{"key":"e_1_3_2_1_61_1","volume-title":"2017 Conference on Neural Information Processing Systems (NIPS","author":"Waswani A","year":"2017","unstructured":"A Waswani, N Shazeer, N Parmar, J Uszkoreit, L Jones, A Gomez, L Kaiser, and I Polosukhin. 2017. Attention is all you need. In 2017 Conference on Neural Information Processing Systems (NIPS 2017)."},{"key":"e_1_3_2_1_62_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38\u201345","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38\u201345. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_64_1","volume-title":"Sigmoid Loss for Language Image Pre-Training. In 2023 IEEE\/CVF International Conference on Computer Vision (ICCV).","author":"Zhai Xiaohua","year":"2023","unstructured":"Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, and Lucas Beyer. 2023. Sigmoid Loss for Language Image Pre-Training. In 2023 IEEE\/CVF International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In Proceedings of the 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_66_1","volume-title":"Connector-S: A Survey of Connectors in Multi-modal Large Language Models. arXiv preprint arXiv:2502.11453","author":"Zhu Xun","year":"2025","unstructured":"Xun Zhu, Zheng Zhang, Xi Chen, Yiming Shi, Miao Li, and Ji Wu. 2025. Connector-S: A Survey of Connectors in Multi-modal Large Language Models. arXiv preprint arXiv:2502.11453 (2025)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.14778\/3685800.3685829"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772254","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:24:43Z","timestamp":1768321483000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772254"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":67,"alternative-id":["10.1145\/3772052.3772254","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772254","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}