{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T11:36:46Z","timestamp":1777462606214,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","funder":[{"name":"IBM-ILLINOIS Discovery Accelerator Institute (IIDAI)","award":["IBM Agreement No. W2177533 ILLINOIS Award No.103509"],"award-info":[{"award-number":["IBM Agreement No. W2177533 ILLINOIS Award No.103509"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3805621.3807662","type":"proceedings-article","created":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:08:45Z","timestamp":1777381725000},"page":"397-406","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Revisiting Disaggregated Large Language Model Serving for Performance and Energy Implications"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3678-2240","authenticated-orcid":false,"given":"Jiaxi","family":"Li","sequence":"first","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, Illinois, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2207-1904","authenticated-orcid":false,"given":"Yue","family":"Zhu","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, New York, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3392-4834","authenticated-orcid":false,"given":"Bo","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, Illinois, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2335-4798","authenticated-orcid":false,"given":"Eun Kyung","family":"Lee","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, New York, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6813-3043","authenticated-orcid":false,"given":"Klara","family":"Nahrstedt","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, Illinois, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"DeepSeek-AI Aixin Liu Bei Feng Bing Xue and et al. 2025. DeepSeek-V3 Technical Report. arXiv:2412.19437 [cs.CL] https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_2_1_2_1","unstructured":"Jiangsu Du Hongbin Zhang Taosheng Wei Zhenyi Zheng Kaiyi Wu Zhiguang Chen and Yutong Lu. 2025. EcoServe: Enabling Costeffective LLM Serving with Proactive Intra- and Inter-Instance Orchestration. arXiv:2504.18154 [cs.DC] https:\/\/arxiv.org\/abs\/2504.18154"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3730999"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the 2024 USENIX Conference on Usenix Annual Technical Conference (Santa Clara, CA, USA) (USENIX ATC'24). USENIX Association, USA, Article 7, 16 pages.","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. Cost-efficient large language model serving for multi-turn conversations with CachedAttention. In Proceedings of the 2024 USENIX Conference on Usenix Annual Technical Conference (Santa Clara, CA, USA) (USENIX ATC'24). USENIX Association, USA, Article 7, 16 pages."},{"key":"e_1_3_2_1_5_1","volume-title":"Nikhil Sarda, Anurag Khandelwal, and Lin Zhong.","author":"Gim In","year":"2024","unstructured":"In Gim, Guojun Chen, Seung seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2024. Prompt Cache: Modular Attention Reuse for Low-Latency Inference. arXiv:2311.04934 [cs.CL] https:\/\/arxiv.org\/abs\/2311.04934"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528535.3565244"},{"key":"e_1_3_2_1_7_1","unstructured":"Ke Hong Lufang Chen Zhong Wang Xiuhong Li Qiuli Mao Jianping Ma Chao Xiong Guanyu Wu Buhe Han Guohao Dai Yun Liang and Yu Wang. 2025. semi-PD: Towards Efficient LLM Serving via Phase-Wise Disaggregated Computation and Unified Storage. arXiv:2504.19867 [cs.CL] https:\/\/arxiv.org\/abs\/2504.19867"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3732941"},{"key":"e_1_3_2_1_9_1","unstructured":"IBM. 2019. IPMI Overview. https:\/\/www.ibm.com\/docs\/en\/power8\/8001-12C?topic=ipmi-overview. Accessed: 2025-11-09."},{"key":"e_1_3_2_1_10_1","unstructured":"Intel. 2022. Running Average Power Limit Energy Reporting. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/software-security-guidance\/advisory-guidance\/running-average-power-limit-energy-reporting.html. Accessed: 2026-04-07."},{"key":"e_1_3_2_1_11_1","unstructured":"Albert Q. Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra Singh Chaplot Diego de las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u00e9lio Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2023. Mistral 7B. arXiv:2310.06825 [cs.CL] https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3768628"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00103"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2024.3406038"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_16_1","unstructured":"Rongzhi Li Ruogu Du Zefang Chu Sida Zhao Chunlei Han Zuocheng Shi Yiwen Shao Huanle Han Long Huang Zherui Liu and Shufan Liu. 2025. Taming the Chaos: Coordinated Autoscaling for Heterogeneous and Disaggregated LLM Inference. arXiv:2508.19559 [cs.DC] https:\/\/arxiv.org\/abs\/2508.19559"},{"key":"e_1_3_2_1_17_1","volume-title":"Andes: Defining and Enhancing Quality-of-Experience in LLM-Based Text Streaming Services. arXiv:2404.16283 [cs.DC] https:\/\/arxiv.org\/abs\/2404.16283","author":"Liu Jiachen","year":"2024","unstructured":"Jiachen Liu, Jae-Won Chung, Zhiyu Wu, Fan Lai, Myungjin Lee, and Mosharaf Chowdhury. 2024. Andes: Defining and Enhancing Quality-of-Experience in LLM-Based Text Streaming Services. arXiv:2404.16283 [cs.DC] https:\/\/arxiv.org\/abs\/2404.16283"},{"key":"e_1_3_2_1_18_1","unstructured":"NVIDIA. 2025. CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide. Accessed: 2025-11-09."},{"key":"e_1_3_2_1_19_1","volume-title":"NIXL: NVIDIA Inference Xfer Library (NIXL). https:\/\/github.com\/ai-dynamo\/nixl. Accessed: 2025-11-09.","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025. NIXL: NVIDIA Inference Xfer Library (NIXL). https:\/\/github.com\/ai-dynamo\/nixl. Accessed: 2025-11-09."},{"key":"e_1_3_2_1_20_1","unstructured":"NVIDIA. 2025. NVIDIA Management Library (NVML). https:\/\/developer.nvidia.com\/management-library-nvml. Accessed: 2025-11-09."},{"key":"e_1_3_2_1_21_1","unstructured":"NVIDIA. 2025. pynvml: Python utilities for the NVIDIA Management Library. https:\/\/pypi.org\/project\/pynvml. Accessed: 2025-11-09."},{"key":"e_1_3_2_1_22_1","unstructured":"NVIDIA. 2026. GPUDirect Storage Documentation. NVIDIA. https:\/\/docs.nvidia.com\/gpudirect-storage\/ Accessed: 2026-02-26."},{"key":"e_1_3_2_1_23_1","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya and et al. 2024. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_25_1","unstructured":"LMCache Project. 2025. LMCache: An Efficient KV-Cache Layer for Large Language Model Inference and Serving. https:\/\/github.com\/LMCache\/LMCache. Accessed: 2025-10-27."},{"key":"e_1_3_2_1_26_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation \u2014 A KVCache-centric Architecture for Serving LLM Chatbot. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). USENIX Association, Santa Clara, CA, 155\u2013170. https:\/\/www.usenix.org\/conference\/fast25\/presentation\/qin"},{"key":"e_1_3_2_1_27_1","unstructured":"Tianyao Shi Yanran Wu Sihang Liu and Yi Ding. 2024. GreenLLM: Disaggregating Large Language Model Serving on Heterogeneous GPUs for Lower Carbon Emissions. arXiv:2412.20322 [cs.AR] https:\/\/arxiv.org\/abs\/2412.20322"},{"key":"e_1_3_2_1_28_1","unstructured":"Jovan Stojkovic Esha Choukse Chaojie Zhang Inigo Goiri and Josep Torrellas. 2024. Towards Greener LLMs: Bringing Energy-Efficiency to the Forefront of LLM Inference. arXiv:2403.20306 [cs.AI] https:\/\/arxiv.org\/abs\/2403.20306"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00102"},{"key":"e_1_3_2_1_30_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv:2302.13971 [cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_2_1_31_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc."},{"key":"e_1_3_2_1_32_1","unstructured":"vLLM Project. 2025. vLLM: A high-throughput and memory-efficient inference and serving engine for large language models. https:\/\/github.com\/vllm-project\/vllm. Accessed: 2025-10-27."},{"key":"e_1_3_2_1_33_1","unstructured":"Chao Wang Pengfei Zuo Zhangyu Chen Yunkai Liang Zhou Yu and Ming-Chang Yang. 2025. Prefill-Decode Aggregation or Disaggregation? Unifying Both for Goodput-Optimized LLM Serving. arXiv:2508.01989 [cs.DC] https:\/\/arxiv.org\/abs\/2508.01989"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference (Boston, MA, USA) (USENIX ATC '25). USENIX Association, USA, Article 28","author":"Wang Jiahao","year":"2025","unstructured":"Jiahao Wang, Jinbo Han, Xingda Wei, Sijie Shen, Dingyan Zhang, Chenguang Fang, Rong Chen, Wenyuan Yu, and Haibo Chen. 2025. KVCache cache in the wild: characterizing and optimizing KVCache cache at a large cloud provider. In Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference (Boston, MA, USA) (USENIX ATC '25). USENIX Association, USA, Article 28, 18 pages."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696086"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Hongbin Zhang Taosheng Wei Zhenyi Zheng Jiangsu Du Zhiguang Chen and Yutong Lu. 2025. TD-Pipe: Temporally-Disaggregated Pipeline Parallelism Architecture for High-Throughput LLM Inference. arXiv:2506.10470 [cs.DC] https:\/\/arxiv.org\/abs\/2506.10470","DOI":"10.1145\/3754598.3754621"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 38th International Conference on Neural Information Processing Systems","author":"Zheng Lianmin","year":"2025","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. 2025. SGLang: efficient execution of structured language model programs. In Proceedings of the 38th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS '24). Curran Associates Inc., Red Hook, NY, USA, Article 2000, 27 pages."},{"key":"e_1_3_2_1_39_1","volume-title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 193\u2013210. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/zhong-yinmin"}],"event":{"name":"EuroSys '26: 21st European Conference on Computer Systems","location":"Edinburgh Scotland Uk","acronym":"EuroMLSys '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Sixth European Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3805621.3807662","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:17:56Z","timestamp":1777382276000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805621.3807662"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,27]]},"references-count":39,"alternative-id":["10.1145\/3805621.3807662","10.1145\/3805621"],"URL":"https:\/\/doi.org\/10.1145\/3805621.3807662","relation":{},"subject":[],"published":{"date-parts":[[2026,4,27]]},"assertion":[{"value":"2026-04-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}