{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T23:25:44Z","timestamp":1768346744285,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772207","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"735-747","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multiplexed Heterogeneous LLM Serving via Stage-Aligned Parallelism"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-4406-1841","authenticated-orcid":false,"given":"Tao","family":"Luo","sequence":"first","affiliation":[{"name":"University of Pennsylvania, Philadelphia, Pennsylvania, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9191-2085","authenticated-orcid":false,"given":"Kelvin K.W.","family":"Ng","sequence":"additional","affiliation":[{"name":"University of Pennsylvania, Philadelphia, Pennsylvania, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1094-6827","authenticated-orcid":false,"given":"Zhen Ping","family":"Khor","sequence":"additional","affiliation":[{"name":"University of Pennsylvania, Philadelphia, Pennsylvania, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1508-9401","authenticated-orcid":false,"given":"Sidharth","family":"Sankhe","sequence":"additional","affiliation":[{"name":"University of Pennsylvania, Philadelphia, Pennsylvania, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4757-1746","authenticated-orcid":false,"given":"Boon Thau","family":"Loo","sequence":"additional","affiliation":[{"name":"University of Pennsylvania, Philadelphia, Pennsylvania, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7683-208X","authenticated-orcid":false,"given":"Vincent","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Pennsylvania, Philadelphia, Pennsylvania, USA"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"351","article-title":"Vidur: A Large-Scale Simulation Framework For LLM Inference","volume":"6","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Jayashree Mohan, Ashish Panwar, Nipun Kwatra, Bhargav Gulavani, Ramachandran Ramjee, and Alexey Tumanov. 2024. Vidur: A Large-Scale Simulation Framework For LLM Inference. Proceedings of Machine Learning and Systems 6 (2024), 351\u2013366.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_2_1","volume-title":"Taming throughput-latency tradeoff in llm inference with sarathi-serve. arXiv preprint arXiv:2403.02310","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming throughput-latency tradeoff in llm inference with sarathi-serve. arXiv preprint arXiv:2403.02310 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Medha: Efficiently Serving Multi-Million Context Length LLM Inference Requests Without Approximations. arXiv preprint arXiv:2409.17264","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Haoran Qiu, Junda Chen, \u00cd\u0144igo Goiri, Chaojie Zhang, Rayyan Shahid, Ramachandran Ramjee, Alexey Tumanov, and Esha Choukse. 2024. Medha: Efficiently Serving Multi-Million Context Length LLM Inference Requests Without Approximations. arXiv preprint arXiv:2409.17264 (2024)."},{"key":"e_1_3_2_1_4_1","unstructured":"Sam Altman. 2025. OPENAI ROADMAP UPDATE FOR GPT-4.5 and GPT-5. https:\/\/x.com\/sama\/status\/1889755723078443244"},{"key":"e_1_3_2_1_5_1","unstructured":"Murali Andoorveedu. 2024. [RFC]: Initial support for Pipeline Paralleism. https:\/\/github.com\/vllm-project\/vllm\/issues\/4461"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 11th ACM Symposium on Cloud Computing","author":"Dhakal Aditya","unstructured":"Aditya Dhakal, Sameer G Kulkarni, and K. K. Ramakrishnan. 2020. GSLICE: controlled spatial sharing of GPUs for a scalable inference platform. In Proceedings of the 11th ACM Symposium on Cloud Computing (Virtual Event, USA) (SoCC '20). Association for Computing Machinery, New York, NY, USA, 492\u2013506."},{"key":"e_1_3_2_1_7_1","volume-title":"MuxServe: Flexible Multiplexing for Efficient Multiple LLM Serving. arXiv preprint arXiv:2404.02015","author":"Duan Jiangfei","year":"2024","unstructured":"Jiangfei Duan, Runyu Lu, Haojie Duanmu, Xiuhong Li, Xingcheng Zhang, Dahua Lin, Ion Stoica, and Hao Zhang. 2024. MuxServe: Flexible Multiplexing for Efficient Multiple LLM Serving. arXiv preprint arXiv:2404.02015 (2024)."},{"key":"e_1_3_2_1_8_1","unstructured":"Hugging Face. 2024. Efficient Training on Multiple GPUs. https:\/\/huggingface.co\/docs\/transformers\/main\/en\/perf_train_gpu_many. Accessed: 2024-12-08."},{"key":"e_1_3_2_1_9_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. {ServerlessLLM}:{Low-Latency} serverless inference for large language models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 135\u2013153."},{"key":"e_1_3_2_1_10_1","volume-title":"Dominant Resource Fairness: Fair Allocation of Multiple Resource Types. In 8th USENIX Symposium on Networked Systems Design and Implementation (NSDI 11)","author":"Ghodsi Ali","year":"2011","unstructured":"Ali Ghodsi, Matei Zaharia, Benjamin Hindman, Andy Konwinski, Scott Shenker, and Ion Stoica. 2011. Dominant Resource Fairness: Fair Allocation of Multiple Resource Types. In 8th USENIX Symposium on Networked Systems Design and Implementation (NSDI 11). USENIX Association, Boston, MA."},{"key":"e_1_3_2_1_11_1","unstructured":"Cunchen Hu Heyang Huang Liangliang Xu Xusheng Chen Jiang Xu Shuang Chen Hao Feng Chenxi Wang Sa Wang Yungang Bao et al. 2024. Inference without interference: Disaggregate llm inference for mixed downstream workloads. arXiv preprint arXiv:2401.11181 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"e_1_3_2_1_14_1","volume-title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 663\u2013679."},{"key":"e_1_3_2_1_15_1","unstructured":"Phillip Lippe. 2024. Tensor Parallelism Tutorial. https:\/\/uvadlc-notebooks.readthedocs.io\/en\/latest\/tutorial_notebooks\/scaling\/JAX\/tensor_parallel_simple.html. Accessed: 2024-12-08."},{"key":"e_1_3_2_1_16_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_17_1","unstructured":"NVIDIA. 2024. Demystifying AI Inference Deployments for Trillion-Parameter Large Language Models. https:\/\/developer.nvidia.com\/blog\/demystifying-ai-inference-deployments-for-trillion-parameter-large-language-models. Accessed: 2024-12-08."},{"key":"e_1_3_2_1_18_1","volume-title":"Accessed","year":"2025","unstructured":"OpenRouter. 2025. Amazon Bedrock Models | OpenRouter. https:\/\/openrouter.ai\/models?fmt=cards&providers=Amazon%20Bedrock. Accessed: April 17, 2025."},{"key":"e_1_3_2_1_19_1","volume-title":"Accessed","year":"2025","unstructured":"OpenRouter. 2025. Azure | OpenRouter: Browse models provided by Azure. https:\/\/openrouter.ai\/provider\/azure. Accessed: April 17, 2025."},{"key":"e_1_3_2_1_20_1","volume-title":"Accessed","year":"2025","unstructured":"OpenRouter. 2025. LLM Rankings | OpenRouter. https:\/\/openrouter.ai\/rankings?view=month. Accessed: April 17, 2025."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_22_1","unstructured":"Archit Patke Dhemath Reddy Saurabh Jha Haoran Qiu Christian Pinto Shengkun Cui Chandra Narayanaswami Zbigniew Kalbarczyk and Ravishankar Iyer. 2024. One Queue Is All You Need: Resolving Head-of-Line Blocking in Large Language Model Serving. arXiv:2407.00047 [cs.DC] https:\/\/arxiv.org\/abs\/2407.00047"},{"key":"e_1_3_2_1_23_1","volume-title":"A comparative study of dispatching rules in dynamic flowshops and jobshops. European journal of operational research 116, 1","author":"Rajendran Chandrasekharan","year":"1999","unstructured":"Chandrasekharan Rajendran and Oliver Holthaus. 1999. A comparative study of dispatching rules in dynamic flowshops and jobshops. European journal of operational research 116, 1 (1999), 156\u2013170."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00102"},{"key":"e_1_3_2_1_25_1","volume-title":"Fault-tolerant Generative LLM Serving. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"46771","author":"Strati Foteini","year":"2024","unstructured":"Foteini Strati, Sara Mcallister, Amar Phanishayee, Jakub Tarnawski, and Ana Klimovic. 2024. D\u00e9j\u00e0Vu: KV-cache Streaming for Fast, Fault-tolerant Generative LLM Serving. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 235), Ruslan Salakhutdinov, Zico Kolter, Katherine Heller, Adrian Weller, Nuria Oliver, Jonathan Scarlett, and Felix Berkenkamp (Eds.). PMLR, 46745\u201346771. https:\/\/proceedings.mlr.press\/v235\/strati24a.html"},{"key":"e_1_3_2_1_26_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_27_1","unstructured":"Bingyang Wu Yinmin Zhong Zili Zhang Shengyu Liu Fangyue Liu Yuanhang Sun Gang Huang Xuanzhe Liu and Xin Jin. 2024. Fast Distributed Inference Serving for Large Language Models. arXiv:2305.05920 [cs.LG] https:\/\/arxiv.org\/abs\/2305.05920"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languagesand Operating Systems","volume":"1","author":"Zeng Shaoxun","year":"2025","unstructured":"Shaoxun Zeng, Minhui Xie, Shiwei Gao, Youmin Chen, and Youyou Lu. 2025. Medusa: Accelerating Serverless LLM Inference with Materialization. In Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languagesand Operating Systems, Volume 1. 653\u2013668."},{"key":"e_1_3_2_1_29_1","volume-title":"Jenga: Effective Memory Management for Serving LLM with Heterogeneity. arXiv:2503.18292 [cs.DC] https:\/\/arxiv.org\/abs\/2503.18292","author":"Zhang Chen","year":"2025","unstructured":"Chen Zhang, Kuntai Du, Shu Liu, Woosuk Kwon, Xiangxi Mo, Yufeng Wang, Xiaoxuan Liu, Kaichao You, Zhuohan Li, Mingsheng Long, Jidong Zhai, Joseph Gonzalez, and Ion Stoica. 2025. Jenga: Effective Memory Management for Serving LLM with Heterogeneity. arXiv:2503.18292 [cs.DC] https:\/\/arxiv.org\/abs\/2503.18292"},{"key":"e_1_3_2_1_30_1","volume-title":"Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P. Xing, Joseph E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 559\u2013578."},{"key":"e_1_3_2_1_31_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. { DistServe } : Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 193\u2013210."},{"key":"e_1_3_2_1_32_1","unstructured":"Yinmin Zhong Zili Zhang Bingyang Wu Shengyu Liu Yukun Chen Changyi Wan Hanpeng Hu Lei Xia Ranchen Ming Yibo Zhu et al. 2024. Optimizing RLHF Training for Large Language Models with Stage Fusion. arXiv preprint arXiv:2409.13221 (2024)."}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772207","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:25:29Z","timestamp":1768321529000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772207"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":32,"alternative-id":["10.1145\/3772052.3772207","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772207","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}