{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T13:55:14Z","timestamp":1774965314279,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"name":"Hong Kong RIF grant","award":["R6021-20"],"award-info":[{"award-number":["R6021-20"]}]},{"name":"Hong Kong CRF grants","award":["C2004-21G, C7004-22G"],"award-info":[{"award-number":["C2004-21G, C7004-22G"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272122"],"award-info":[{"award-number":["62272122"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737413","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T20:52:41Z","timestamp":1754254361000},"page":"5831-5841","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["BurstGPT: A Real-World Workload Dataset to Optimize LLM Serving Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7680-1419","authenticated-orcid":false,"given":"Yuxin","family":"Wang","sequence":"first","affiliation":[{"name":"Huawei Hong Kong Research Center, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0791-5361","authenticated-orcid":false,"given":"Yuhan","family":"Chen","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4381-0544","authenticated-orcid":false,"given":"Zeyu","family":"Li","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), GuangZhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4096-5698","authenticated-orcid":false,"given":"Xueze","family":"Kang","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5533-6367","authenticated-orcid":false,"given":"Yuchu","family":"Fang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Ltd., Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8157-7474","authenticated-orcid":false,"given":"Yeju","family":"Zhou","sequence":"additional","affiliation":[{"name":"Huawei Technologies Ltd., Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3775-6739","authenticated-orcid":false,"given":"Yang","family":"Zheng","sequence":"additional","affiliation":[{"name":"Huawei Technologies Ltd., Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8769-9974","authenticated-orcid":false,"given":"Zhenheng","family":"Tang","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8764-8157","authenticated-orcid":false,"given":"Xin","family":"He","sequence":"additional","affiliation":[{"name":"Hong Kong Baptist University, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1622-1572","authenticated-orcid":false,"given":"Rui","family":"Guo","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8513-0902","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2986-967X","authenticated-orcid":false,"given":"Qiang","family":"Wang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9773-9332","authenticated-orcid":false,"given":"Amelie Chi","family":"Zhou","sequence":"additional","affiliation":[{"name":"Hong Kong Baptist University, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9745-4372","authenticated-orcid":false,"given":"Xiaowen","family":"Chu","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/3691938.3691945"},{"key":"e_1_3_2_2_2_1","unstructured":"Muhammad Ahsan Sacheendra Talluri and Alexandru Iosup. 2023. Failure Analysis of Big Cloud Service Providers Prior to and During Covid-19 Period. arxiv:2210.08006 [cs.DC]"},{"key":"e_1_3_2_2_3_1","volume-title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. arXiv preprint arXiv:2305.13245(2023).","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. arXiv preprint arXiv:2305.13245(2023)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"e_1_3_2_2_5_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec"},{"key":"e_1_3_2_2_6_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems, Vol. 35 (2022), 16344-16359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_7_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit matrix multiplication for transformers at scale. In Proceedings of the 36th International Conference on Neural Information Processing Systems(New Orleans, LA, USA) (NIPS '22). Curran Associates Inc., Red Hook, NY, USA, Article 2198, 15 pages."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717481"},{"key":"e_1_3_2_2_9_1","first-page":"111","volume-title":"Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention. In 2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Gao Bin","year":"2024","unstructured":"Bin Gao, Zhuomin He, Puru Sharma, Qingxuan Kang, Djordje Jevdjic, Junbo Deng, Xingkun Yang, Zhou Yu, and Pengfei Zuo. 2024. Cost-Efficient Large Language Model Serving for Multi-turn Conversations with CachedAttention. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 111-126. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/gao-bin-cost"},{"key":"e_1_3_2_2_10_1","unstructured":"Suyu Ge Yunan Zhang Liyuan Liu Minjia Zhang Jiawei Han and Jianfeng Gao. 2023. Model Tells You What to Discard: Adaptive KV Cache Compression for LLMs. arXiv preprint arXiv:2310.01801(2023)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716011"},{"key":"e_1_3_2_2_12_1","unstructured":"Arnav Gudibande Eric Wallace Charlie Snell Xinyang Geng Hao Liu Pieter Abbeel Sergey Levine and Dawn Song. 2023. The False Promise of Imitating Proprietary LLMs. arxiv:2305.15717 [cs.CL]"},{"key":"e_1_3_2_2_13_1","volume-title":"Expertflow: Optimized expert activation and token allocation for efficient mixture-of-experts inference. arXiv preprint arXiv:2410.17954(2024).","author":"He Xin","year":"2024","unstructured":"Xin He, Shunkang Zhang, Yuxin Wang, Haiyan Yin, Zihao Zeng, Shaohuai Shi, Zhenheng Tang, Xiaowen Chu, Ivor Tsang, and Ong Yew Soon. 2024. Expertflow: Optimized expert activation and token allocation for efficient mixture-of-experts inference. arXiv preprint arXiv:2410.17954(2024)."},{"key":"e_1_3_2_2_14_1","volume-title":"Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa(Eds.)","volume":"6","author":"Hong Ke","year":"2024","unstructured":"Ke Hong, Guohao Dai, Jiaming Xu, Qiuli Mao, Xiuhong Li, Jun Liu, Kangdi Chen, Yuhan Dong, and Yu Wang. 2024. FlashDecoding: Faster Large Language Model Inference with Asynchronization, Flat GEMM Optimization, and Heuristics. In Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa(Eds.), Vol. 6. 148-161. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/5321b1dabcd2be188d796c21b733e8c7-Paper-Conference.pdf"},{"key":"e_1_3_2_2_15_1","unstructured":"Cunchen Hu Heyang Huang Junhao Hu Jiang Xu Xusheng Chen Tao Xie Chenxi Wang Sa Wang Yungang Bao Ninghui Sun and Yizhou Shan. 2024. MemServe: Context Caching for Disaggregated LLM Serving with Elastic Memory Pool. arxiv:2406.17565 [cs.DC] https:\/\/arxiv.org\/abs\/2406.17565"},{"key":"e_1_3_2_2_16_1","unstructured":"Woosuk Kwon Zhuohan Li Siyuan Zhuang Ying Sheng Lianmin Zheng Cody Yu Joey Gonzalez Hao Zhang and Ion Stoica. 2023a. vLLM: Easy Fast and Cheap LLM Serving with PagedAttention. https:\/\/vllm.ai\/"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_2_18_1","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Lee Wonbeom","year":"2024","unstructured":"Wonbeom Lee, Jungi Lee, Junghwan Seo, and Jaewoong Sim. 2024. InfiniGen: efficient generative inference of large language models with dynamic KV cache management. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation(Santa Clara, CA, USA) (OSDI'24). USENIX Association, USA, Article 9, 18 pages."},{"key":"e_1_3_2_2_19_1","volume-title":"ALPACA: A New Semi-Analytic Model for Metal Absorption Lines Emerging from Clumpy Galactic Environments. arxiv:2306.11089 [astro-ph.GA]","author":"Li Zhihui","year":"2023","unstructured":"Zhihui Li, Max Gronke, and Charles Steidel. 2023a. ALPACA: A New Semi-Analytic Model for Metal Absorption Lines Emerging from Clumpy Galactic Environments. arxiv:2306.11089 [astro-ph.GA]"},{"key":"e_1_3_2_2_20_1","unstructured":"Zhuohan Li Lianmin Zheng Yinmin Zhong Vincent Liu Ying Sheng Xin Jin Yanping Huang Zhifeng Chen Hao Zhang Joseph E Gonzalez et al. 2023c. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. arXiv preprint arXiv:2302.11665(2023)."},{"key":"e_1_3_2_2_21_1","first-page":"663","volume-title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023b. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 663-679. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/li-zhouhan"},{"key":"e_1_3_2_2_22_1","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Lin Chaofan","year":"2024","unstructured":"Chaofan Lin, Zhenhua Han, Chengruidong Zhang, Yuqing Yang, Fan Yang, Chen Chen, and Lili Qiu. 2024. Parrot: efficient serving of LLM-based applications with semantic variable. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation(Santa Clara, CA, USA) (OSDI'24). USENIX Association, USA, Article 50, 17 pages."},{"key":"e_1_3_2_2_23_1","unstructured":"Xupeng Miao Gabriele Oliaro Zhihao Zhang Xinhao Cheng Hongyi Jin Tianqi Chen and Zhihao Jia. 2023. Towards Efficient Generative Large Language Model Serving: A Survey from Algorithms to Systems. arXiv preprint arXiv:2312.15234(2023)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"e_1_3_2_2_26_1","unstructured":"Microsoft. 2024. Azure OpenAI Service. https:\/\/azure.microsoft.com\/en-us\/products\/ai-services\/openai-service\/"},{"key":"e_1_3_2_2_27_1","volume-title":"Vaidya anf Fred and Comly Nick","author":"Neal Oh","year":"2023","unstructured":"Oh Neal, Vaidya anf Fred and Comly Nick. 2023. Optimizing inference on large language models with nvidia tensorrt-llm. https:\/\/github.com\/NVIDIA\/TensorRT-LLM"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640383"},{"key":"e_1_3_2_2_29_1","unstructured":"OpenAI. 2022. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_2_30_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774(2023)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707256"},{"key":"e_1_3_2_2_33_1","first-page":"155","volume-title":"23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation textemdash A KVCache-centric Architecture for Serving LLM Chatbot. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). USENIX Association, Santa Clara, CA, 155-170. https:\/\/www.usenix.org\/conference\/fast25\/presentation\/qin"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_2_35_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=tVConYid20","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=tVConYid20"},{"key":"e_1_3_2_2_36_1","first-page":"205","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, Inigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. 2020. Serverless in the Wild: Characterizing and Optimizing the Serverless Workload at a Large Cloud Provider. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). USENIX Association, 205-218. https:\/\/www.usenix.org\/conference\/atc20\/presentation\/shahrad"},{"key":"e_1_3_2_2_37_1","unstructured":"Noam Shazeer. 2019. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150(2019)."},{"key":"e_1_3_2_2_38_1","first-page":"965","volume-title":"Fairness in Serving Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sheng Ying","year":"2024","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Banghua Zhu, Zhuohan Li, Danyang Zhuo, Joseph E. Gonzalez, and Ion Stoica. 2024. Fairness in Serving Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 965-988. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/sheng"},{"key":"e_1_3_2_2_39_1","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Sun Biao","year":"2024","unstructured":"Biao Sun, Ziming Huang, Hanyu Zhao, Wencong Xiao, Xinyi Zhang, Yong Li, and Wei Lin. 2024. Llumnix: dynamic scheduling for large language model serving. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation(Santa Clara, CA, USA) (OSDI'24). USENIX Association, USA, Article 10, 19 pages."},{"key":"e_1_3_2_2_40_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=E7fZOoiEKl","author":"Tang Zhenheng","year":"2024","unstructured":"Zhenheng Tang, Yonggang Zhang, Peijie Dong, Yiu ming Cheung, Amelie Chi Zhou, Bo Han, and Xiaowen Chu. 2024. FuseFL: One-Shot Federated Learning through the Lens of Causality with Progressive Model Fusion. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=E7fZOoiEKl"},{"key":"e_1_3_2_2_41_1","unstructured":"Gemini Team Machel Reid Nikolay Savinov and etc. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arxiv:2403.05530 [cs.CL]"},{"key":"e_1_3_2_2_42_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv:2302.13971 [cs.CL]"},{"key":"e_1_3_2_2_43_1","unstructured":"Bingyang Wu Yinmin Zhong Zili Zhang Gang Huang Xuanzhe Liu and Xin Jin. 2023. Fast Distributed Inference Serving for Large Language Models. arxiv:2305.05920 [cs.LG]"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717455"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.14778\/3626292.3626303"},{"key":"e_1_3_2_2_46_1","first-page":"699","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Xia Haojun","year":"2024","unstructured":"Haojun Xia, Zhen Zheng, Xiaoxia Wu, Shiyang Chen, Zhewei Yao, Stephen Youn, Arash Bakhtiari, Michael Wyatt, Donglin Zhuang, Zhongzhu Zhou, Olatunji Ruwase, Yuxiong He, and Shuaiwen Leon Song. 2024. Quant-LLM: Accelerating the Serving of Large Language Models via FP6-Centric Algorithm-System Co-Design on Modern GPUs. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 699-713. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/xia"},{"key":"e_1_3_2_2_47_1","volume-title":"International Conference on Machine Learning. PMLR, 38087-38099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087-38099."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"e_1_3_2_2_49_1","first-page":"521","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521-538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483580"},{"key":"e_1_3_2_2_51_1","volume-title":"Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa(Eds.)","volume":"6","author":"Zhao Yilong","year":"2024","unstructured":"Yilong Zhao, Chien-Yu Lin, Kan Zhu, Zihao Ye, Lequn Chen, Size Zheng, Luis Ceze, Arvind Krishnamurthy, Tianqi Chen, and Baris Kasikci. 2024. Atom: Low-Bit Quantization for Efficient and Accurate LLM Serving. In Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa(Eds.), Vol. 6. 196-209. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/5edb57c05c81d04beb716ef1d542fe9e-Paper-Conference.pdf"},{"key":"e_1_3_2_2_52_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Zheng Zangwei","year":"2023","unstructured":"Zangwei Zheng, Xiaozhe Ren, Fuzhao Xue, Yang Luo, Xin Jiang, and Yang You. 2023. Response length perception and sequence scheduling: an LLM-empowered LLM inference pipeline. In Proceedings of the 37th International Conference on Neural Information Processing Systems(New Orleans, LA, USA) (NIPS '23). Curran Associates Inc., Red Hook, NY, USA, Article 2859, 14 pages."},{"key":"e_1_3_2_2_53_1","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: disaggregating prefill and decoding for goodput-optimized large language model serving. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation(Santa Clara, CA, USA) (OSDI'24). USENIX Association, USA, Article 11, 18 pages."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"crossref","unstructured":"Jiahang Zhou Yanyu Chen Zicong Hong Wuhui Chen Yue Yu Tao Zhang Hui Wang Chuanfu Zhang and Zibin Zheng. 2024. Training and Serving System of Foundation Models: A Comprehensive Survey. arxiv:2401.02643 [cs.AI]","DOI":"10.1109\/OJCS.2024.3380828"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737413","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T14:30:04Z","timestamp":1755354604000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737413"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":54,"alternative-id":["10.1145\/3711896.3737413","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737413","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}