{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:44:22Z","timestamp":1768031062261,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767396","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:18:44Z","timestamp":1762532324000},"page":"508-515","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["WAGES: Workload-Aware GPU Sharing System for Energy-Efficient Serverless LLM Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4777-8337","authenticated-orcid":false,"given":"Tianyu","family":"Wang","sequence":"first","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0373-1867","authenticated-orcid":false,"given":"Gourav","family":"Rattihalli","sequence":"additional","affiliation":[{"name":"Hewlett Packard Enterprise (HPE), Milpitas, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8297-8525","authenticated-orcid":false,"given":"Aditya","family":"Dhakal","sequence":"additional","affiliation":[{"name":"Hewlett Packard Enterprise (HPE), Milpitas, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3385-2053","authenticated-orcid":false,"given":"Xulong","family":"Tang","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9830-8588","authenticated-orcid":false,"given":"Dejan","family":"Milojicic","sequence":"additional","affiliation":[{"name":"Hewlett Packard Enterprise (HPE), Palo Alto, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696074"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624238"},{"key":"e_1_3_3_1_5_2","unstructured":"Jiangfei Duan Runyu Lu Haojie Duanmu Xiuhong Li Xingcheng Zhang Dahua Lin Ion Stoica and Hao Zhang. 2024. Muxserve: Flexible spatial-temporal multiplexing for multiple llm serving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02015 (2024)."},{"key":"e_1_3_3_1_6_2","first-page":"135","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. { ServerlessLLM} :{ Low-Latency} serverless inference for large language models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 135\u2013153."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629567"},{"key":"e_1_3_3_1_8_2","unstructured":"Tao Huang Pengfei Chen Kyoka Gong Jocky Hawk Zachary Bright Wenxin Xie Kecheng Huang and Zhi Ji. 2024. ENOVA: Autoscaling towards Cost-effective and Stable Serverless LLM Serving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.09486 (2024)."},{"key":"e_1_3_3_1_9_2","first-page":"18015","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Jin Yunho","year":"2023","unstructured":"Yunho Jin, Chun-Feng Wu, David Brooks, and Gu-Yeon Wei. 2023. S'3: Increasing GPU Utilization during Generative Inference for Higher Throughput. In Advances in Neural Information Processing Systems, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 18015\u201318027. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/3a13be0c5dae69e0f08065f113fb10b8-Paper-Conference.pdf"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530510"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00048"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563510"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563510"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00031"},{"key":"e_1_3_3_1_16_2","first-page":"663","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph\u00a0E Gonzalez, et\u00a0al. 2023. { AlpaServe} : Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663\u2013679."},{"key":"e_1_3_3_1_17_2","unstructured":"Chiheng Lou Sheng Qi Chao Jin Dapeng Nie Haoran Yang Xuanzhe Liu and Xin Jin. 2025. Towards Swift Serverless LLM Cold Starts with ParaServe. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.15524 (2025)."},{"key":"e_1_3_3_1_18_2","unstructured":"Salman Mohamadi Ghulam Mujtaba Ngan Le Gianfranco Doretto and Donald\u00a0A Adjeroh. 2023. ChatGPT in the age of generative AI and large language models: a concise survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.04251 (2023)."},{"key":"e_1_3_3_1_19_2","unstructured":"NVIDIA. 2023. NVIDIA Driver Documentation - NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/"},{"key":"e_1_3_3_1_20_2","unstructured":"NVIDIA. 2023. NVIDIA GPU Management and Deployment - Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624664"},{"key":"e_1_3_3_1_22_2","unstructured":"Haoran Qiu Weichao Mao Archit Patke Shengkun Cui Saurabh Jha Chen Wang Hubertus Franke Zbigniew\u00a0T Kalbarczyk Tamer Ba\u015far and Ravishankar\u00a0K Iyer. 2024. Efficient interactive llm serving with proxy model-based sequence length prediction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.08509 (2024)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD.2019.00018"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD60044.2023.00014"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00102"},{"key":"e_1_3_3_1_26_2","unstructured":"Gemini Team R Anil S Borgeaud Y Wu JB Alayrac J Yu R Soricut J Schalkwyk AM Dai A Hauth et\u00a0al. 2024. Gemini: A family of highly capable multimodal models 2024. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.11805 10 (2024)."},{"key":"e_1_3_3_1_27_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_28_2","unstructured":"Tianyu Wang Sheng Li Bingyao Li Yue Dai Ao Li Geng Yuan Yufei Ding Youtao Zhang and Xulong Tang. 2024. Improving GPU Multi-Tenancy Through Dynamic Multi-Instance GPU Reconfiguration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.13126 (2024)."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_3_1_30_2","unstructured":"Shan Yu Jiarong Xing Yifan Qiao Mingyuan Ma Yangmin Li Yang Wang Shuo Yang Zhiqiang Xie Shiyi Cao Ke Bao et\u00a0al. 2025. Prism: Unleashing GPU Sharing for Cost-Efficient Multi-LLM Serving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.04021 (2025)."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707285"},{"key":"e_1_3_3_1_32_2","unstructured":"Huaizheng Zhang Yuanming Li Wencong Xiao Yizheng Huang Xing Di Jianxiong Yin Simon See Yong Luo Chiew\u00a0Tong Lau and Yang You. 2023. MIGPerf: A Comprehensive Benchmark for Deep Learning Training and Inference Workloads on Multi-Instance GPUs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.00407 (2023)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696070"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3710848.3710863"},{"key":"e_1_3_3_1_35_2","first-page":"65517","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zheng Zangwei","year":"2023","unstructured":"Zangwei Zheng, Xiaozhe Ren, Fuzhao Xue, Yang Luo, Xin Jiang, and Yang You. 2023. Response Length Perception and Sequence Scheduling: An LLM-Empowered LLM Inference Pipeline. In Advances in Neural Information Processing Systems , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 65517\u201365530. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/ce7ff3405c782f761fac7f849b41ae9a-Paper-Conference.pdf"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767396","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:34:37Z","timestamp":1767987277000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767396"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":34,"alternative-id":["10.1145\/3731599.3767396","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767396","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}