{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T23:08:07Z","timestamp":1768345687843,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62402024"],"award-info":[{"award-number":["62402024"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024YFB4505901"],"award-info":[{"award-number":["2024YFB4505901"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772264","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"881-893","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["C\n                    <scp>auchy<\/scp>\n                    : A Cost-Efficient LLM Serving System through Adaptive Heterogeneous Deployment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5768-0063","authenticated-orcid":false,"given":"Yihui","family":"Zhang","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0517-5467","authenticated-orcid":false,"given":"Han","family":"Shen","sequence":"additional","affiliation":[{"name":"Kuaishou Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6334-4925","authenticated-orcid":false,"given":"Renyu","family":"Yang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7618-6668","authenticated-orcid":false,"given":"Di","family":"Tian","sequence":"additional","affiliation":[{"name":"Kuaishou Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7584-1020","authenticated-orcid":false,"given":"Yuxi","family":"Luo","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5274-5512","authenticated-orcid":false,"given":"Menghao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2990-1614","authenticated-orcid":false,"given":"Li","family":"Li","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3473-9703","authenticated-orcid":false,"given":"Chunming","family":"Hu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5331-3364","authenticated-orcid":false,"given":"Tianyu","family":"Wo","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3826-8436","authenticated-orcid":false,"given":"Chengru","family":"Song","sequence":"additional","affiliation":[{"name":"Kuaishou Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6591-7717","authenticated-orcid":false,"given":"Jin","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Kuaishou Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming {Throughput-Latency} tradeoff in {LLM} inference with {Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 117\u2013134."},{"key":"e_1_3_2_1_3_1","volume-title":"Generative AI on AWS. https:\/\/aws.amazon.com\/ai\/generative-ai Retrieved","author":"Services Amazon Web","year":"2025","unstructured":"Amazon Web Services. 2025. Generative AI on AWS. https:\/\/aws.amazon.com\/ai\/generative-ai Retrieved June, 2025."},{"key":"e_1_3_2_1_4_1","volume-title":"Ai Document Summary. https:\/\/www.claude.ai Retrieved","year":"2025","unstructured":"Anthropic. 2025. Ai Document Summary. https:\/\/www.claude.ai Retrieved June, 2025."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/github.com\/Azure\/AzurePublicDataset Retrieved","author":"Dataset Azure Public","year":"2025","unstructured":"Azure Public Dataset. 2025. AzureLLMInferenceTrace. https:\/\/github.com\/Azure\/AzurePublicDataset Retrieved June, 2025."},{"key":"e_1_3_2_1_6_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023. Qwen Technical Report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Unleashing 10,000+ Word Generation from Long Context LLMs. arXiv preprint arXiv:2408.07055","author":"Bai Yushi","year":"2024","unstructured":"Yushi Bai, Jiajie Zhang, Xin Lv, Linzhi Zheng, Siqi Zhu, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li. 2024. LongWriter: Unleashing 10,000+ Word Generation from Long Context LLMs. arXiv preprint arXiv:2408.07055 (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"A python Linear Programming API. https:\/\/github.com\/coin-or\/pulp Retrieved","author":"Foundation COIN-OR","year":"2025","unstructured":"COIN-OR Foundation. 2025. A python Linear Programming API. https:\/\/github.com\/coin-or\/pulp Retrieved June, 2025."},{"key":"e_1_3_2_1_9_1","volume-title":"EcoServe: Enabling Cost-effective LLM Serving with Proactive Intra-and Inter-Instance Orchestration. arXiv preprint arXiv:2504.18154","author":"Du Jiangsu","year":"2025","unstructured":"Jiangsu Du, Hongbin Zhang, Taosheng Wei, Zhenyi Zheng, Kaiyi Wu, Zhiguang Chen, and Yutong Lu. 2025. EcoServe: Enabling Cost-effective LLM Serving with Proactive Intra-and Inter-Instance Orchestration. arXiv preprint arXiv:2504.18154 (2025)."},{"key":"e_1_3_2_1_10_1","volume-title":"Efficient LLM Scheduling by Learning to Rank. arXiv preprint arXiv:2408.15792","author":"Fu Yichao","year":"2024","unstructured":"Yichao Fu, Siqi Zhu, Runlong Su, Aurick Qiao, Ion Stoica, and Hao Zhang. 2024. Efficient LLM Scheduling by Learning to Rank. arXiv preprint arXiv:2408.15792 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Cloud GPU Price Comparison. https:\/\/getdeploying.com\/reference\/cloud-gpu Retrieved","year":"2025","unstructured":"GetDeploying. 2025. Cloud GPU Price Comparison. https:\/\/getdeploying.com\/reference\/cloud-gpu Retrieved June, 2025."},{"key":"e_1_3_2_1_12_1","volume-title":"Cost of building and deploying AI models in Vertex AI. https:\/\/cloud.google.com\/vertex- ai\/generative-ai\/pricing Retrieved","author":"Cloud Google","year":"2025","unstructured":"Google Cloud. 2025. Cost of building and deploying AI models in Vertex AI. https:\/\/cloud.google.com\/vertex- ai\/generative-ai\/pricing Retrieved June, 2025."},{"key":"e_1_3_2_1_13_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"M'elange: Cost efficient large language model serving by exploiting gpu heterogeneity. arXiv preprint arXiv.2404.14527","author":"Griggs Tyler","year":"2024","unstructured":"Tyler Griggs, Xiaoxuan Liu, Jiaxiang Yu, Doyoung Kim, Wei-Lin Chiang, Alvin Cheung, and Ion Stoica. 2024. M'elange: Cost efficient large language model serving by exploiting gpu heterogeneity. arXiv preprint arXiv.2404.14527 (2024)."},{"key":"e_1_3_2_1_15_1","unstructured":"Karl Moritz Hermann Tom\u00e1s Kocisk\u00fd Edward Grefenstette Lasse Espeholt Will Kay Mustafa Suleyman and Phil Blunsom. 2015. Teaching Machines to Read and Comprehend. In NIPS. 1693\u20131701. http:\/\/papers.nips.cc\/paper\/5945-teaching-machines- to-read- and-comprehend"},{"key":"e_1_3_2_1_16_1","volume-title":"Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, et al.","author":"Holmes Connor","year":"2024","unstructured":"Connor Holmes, Masahiro Tanaka, Michael Wyatt, Ammar Ahmad Awan, Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, et al. 2024. Deepspeed-fastgen: High-throughput text generation for llms via mii and deepspeed-inference. arXiv preprint arXiv:2401.08671 (2024)."},{"key":"e_1_3_2_1_17_1","unstructured":"Ke Hong Lufang Chen Zhong Wang Xiuhong Li Qiuli Mao Jianping Ma Chao Xiong Guanyu Wu Buhe Han Guohao Dai et al. 2025. semi-PD: Towards Efficient LLM Serving via Phase-Wise Disaggregated Computation and Unified Storage. arXiv preprint arXiv:2504.19867 (2025)."},{"key":"e_1_3_2_1_18_1","volume-title":"ShareGPT datasets. https:\/\/huggingface.co\/datasets\/anon8231489123\/ShareGPT_Vicuna_unfiltered Retrieved","year":"2025","unstructured":"Huggingface. 2025. ShareGPT datasets. https:\/\/huggingface.co\/datasets\/anon8231489123\/ShareGPT_Vicuna_unfiltered Retrieved June, 2025."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_20_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Introducing llama 3.1: Our most capable models to date","author":"Meta AI","year":"2024","unstructured":"AI Meta. 2024. Introducing llama 3.1: Our most capable models to date, 2024. URL https:\/\/ai. meta. com\/blog\/meta-llama-3-1\/. New models including flagship 405B parameter model, along with upgraded 8B and 70B models featuring 128K context length and multilingual capabilities (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"https:\/\/copilot.microsoft.com Retrieved","author":"Copilot Microsoft","year":"2025","unstructured":"Microsoft. 2025. Microsoft Copilot. https:\/\/copilot.microsoft.com Retrieved June, 2025."},{"key":"e_1_3_2_1_23_1","volume-title":"Azure OpenAI Service pricing. https:\/\/azure.microsoft.com\/en-us\/pricing\/details\/cognitive-services\/openai-service\/ Retrieved","author":"Azure Microsoft","year":"2025","unstructured":"Microsoft Azure. 2025. Azure OpenAI Service pricing. https:\/\/azure.microsoft.com\/en-us\/pricing\/details\/cognitive-services\/openai-service\/ Retrieved June, 2025."},{"key":"e_1_3_2_1_24_1","unstructured":"NVIDIA. 2025. NVIDIA Unveils Rubin CPX: A New Class of GPU Designed for Massive-Context Inference. https:\/\/nvidianews.nvidia.com\/news\/nvidia-unveils-rubin-cpx-a-new-class-of-gpu-designed-for-massive-context-inference Retrieved Sept 2025."},{"key":"e_1_3_2_1_25_1","volume-title":"ChatGPT: smart and simple AI. https:\/\/www.chatgpt.com Retrieved","author":"AI.","year":"2025","unstructured":"OpenAI. 2025. ChatGPT: smart and simple AI. https:\/\/www.chatgpt.com Retrieved June, 2025."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_27_1","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation\u2014A {KVCache-centric} Architecture for Serving {LLM} Chatbot. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). 155\u2013170."},{"key":"e_1_3_2_1_28_1","volume-title":"Olivier Tardieu, Jordi Torres, and Josep Ll Berral.","author":"Recasens Pol G","year":"2025","unstructured":"Pol G Recasens, Ferran Agullo, Yue Zhu, Chen Wang, Eun Kyung Lee, Olivier Tardieu, Jordi Torres, and Josep Ll Berral. 2025. Mind the memory gap: Unveiling gpu bottlenecks in large-batch llm inference. arXiv preprint arXiv:2503.08311 (2025)."},{"key":"e_1_3_2_1_29_1","volume-title":"DynaServe: Unified and Elastic Execution for Dynamic Disaggregated LLM Serving. arXiv preprint arXiv:2504.09285","author":"Ruan Chaoyi","year":"2025","unstructured":"Chaoyi Ruan, Yinhe Chen, Dongqi Tian, Yandong Shi, Yongji Wu, Jialin Li, and Cheng Li. 2025. DynaServe: Unified and Elastic Execution for Dynamic Disaggregated LLM Serving. arXiv preprint arXiv:2504.09285 (2025)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1099"},{"key":"e_1_3_2_1_31_1","unstructured":"Mohammad Shahrad Rodrigo Fonseca Inigo Goiri Gohar Chaudhry Paul Batum Jason Cooke Eduardo Laureano Colby Tresness Mark Russinovich and Ricardo Bianchini. 2020. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC 20). 205\u2013218."},{"key":"e_1_3_2_1_32_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sheng Ying","year":"2024","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Banghua Zhu, Zhuohan Li, Danyang Zhuo, Joseph E Gonzalez, and Ion Stoica. 2024. Fairness in serving large language models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 965\u2013988."},{"key":"e_1_3_2_1_33_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sun Biao","year":"2024","unstructured":"Biao Sun, Ziming Huang, Hanyu Zhao, Wencong Xiao, Xinyi Zhang, Yong Li, and Wei Lin. 2024. Llumnix: Dynamic scheduling for large language model serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 173\u2013191."},{"key":"e_1_3_2_1_34_1","volume-title":"https:\/\/vast.ai\/pricing Retrieved","author":"Vast Pricing","year":"2025","unstructured":"vast.ai. 2025. Pricing \/ Vast.ai. https:\/\/vast.ai\/pricing Retrieved June, 2025."},{"key":"e_1_3_2_1_35_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_2_1_36_1","volume-title":"Tempo: Application-aware LLM Serving with Mixed SLO Requirements. arXiv preprint arXiv:2504.20068","author":"Zhang Wei","year":"2025","unstructured":"Wei Zhang, Zhiyu Wu, Yi Mu, Banruo Liu, Myungjin Lee, and Fan Lai. 2025. Tempo: Application-aware LLM Serving with Mixed SLO Requirements. arXiv preprint arXiv:2504.20068 (2025)."},{"key":"e_1_3_2_1_37_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. {DistServe}: Disaggregating prefill and decoding for goodput-optimized large language model serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 193\u2013210."}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772264","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:22:16Z","timestamp":1768321336000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772264"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":37,"alternative-id":["10.1145\/3772052.3772264","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772264","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}