{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T13:13:51Z","timestamp":1776950031901,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"name":"National Science Foundation","award":["2451392"],"award-info":[{"award-number":["2451392"]}]},{"name":"Deutsche Forschungsgemeinschaft","award":["510552229"],"award-info":[{"award-number":["510552229"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,4]]},"DOI":"10.1145\/3777884.3796983","type":"proceedings-article","created":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T12:27:26Z","timestamp":1776947246000},"page":"135-146","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Kubernetes Performance for GenAI Inference: From Automatic Speech Recognition to LLM Summarization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0572-7618","authenticated-orcid":false,"given":"Sai Sindhur","family":"Malleni","sequence":"first","affiliation":[{"name":"Red Hat, Boston, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6042-0143","authenticated-orcid":false,"given":"Ra\u00fal","family":"Sevilla","sequence":"additional","affiliation":[{"name":"Red Hat, Madrid, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4182-3980","authenticated-orcid":false,"given":"Aleksei","family":"Vasilevskii","sequence":"additional","affiliation":[{"name":"Red Hat, Munich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3092-3522","authenticated-orcid":false,"given":"Jos\u00e9 Castillo","family":"Lema","sequence":"additional","affiliation":[{"name":"Red Hat, Madrid, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5582-8812","authenticated-orcid":false,"given":"Andr\u00e9","family":"Bauer","sequence":"additional","affiliation":[{"name":"Illinois Institute of Technology, Chicago, USA"}]}],"member":"320","published-online":{"date-parts":[[2026,5,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3639478.3640034"},{"key":"e_1_3_2_1_2_1","unstructured":"Amazon Web Services Inc.. 2024. Red Hat OpenShift Service on AWS. https:\/\/aws.amazon.com\/rosa\/. Accessed: 2025-10-27."},{"key":"e_1_3_2_1_3_1","volume-title":"Kueue: Kubernetes-native Job Queueing. https:\/\/kueue.sigs.k8s.io\/. Accessed: 2025-09-29.","author":"Authors Kueue Project","year":"2025","unstructured":"Kueue Project Authors. 2025a. Kueue: Kubernetes-native Job Queueing. https:\/\/kueue.sigs.k8s.io\/. Accessed: 2025-09-29."},{"key":"e_1_3_2_1_4_1","unstructured":"The Kubernetes Authors. 2025b. Dynamic Resource Allocation (DRA) in Kubernetes. https:\/\/kubernetes.io\/docs\/concepts\/scheduling-eviction\/dynamic-resource-allocation\/. Accessed: 2025-09-29."},{"key":"e_1_3_2_1_5_1","unstructured":"The Kubernetes Authors. 2025c. Kubernetes Device Plugin Framework. https:\/\/kubernetes.io\/docs\/concepts\/extend-kubernetes\/compute-storage-net\/device-plugins\/. Accessed: 2025-09-29."},{"key":"e_1_3_2_1_6_1","unstructured":"The Kubernetes Authors. 2025d. Kubernetes Topology Manager. https:\/\/kubernetes.io\/docs\/tasks\/administer-cluster\/topology-manager\/. Accessed: 2025-09-29."},{"key":"e_1_3_2_1_7_1","unstructured":"The Kubernetes Authors. 2025 e. Resource Quotas in Kubernetes. https:\/\/kubernetes.io\/docs\/concepts\/policy\/resource-quotas\/. Accessed: 2025-09-29."},{"key":"e_1_3_2_1_8_1","first-page":"1481","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Bartolomeo Giovanni","year":"2025","unstructured":"Giovanni Bartolomeo, Navidreza Asadi, Wolfgang Kellerer, Jorg Ott, and Nitinder Mohan. 2025. Container Partitioning for Distributed . In 2025 USENIX Annual Technical Conference (USENIX ATC 25). 1481-1500."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2898442.2898444"},{"key":"e_1_3_2_1_10_1","volume-title":"EPJ Web of Conferences","volume":"295","author":"Ciangottini Diego","year":"2024","unstructured":"Diego Ciangottini, Giulio Bianchini, Mirko Mariotti, Daniele Spiga, Loriano Storchi, and Giacomo Surace. 2024. KServe inference extension for an FPGA vendor-free ecosystem. In EPJ Web of Conferences, Vol. 295. EDP Sciences, 11012."},{"key":"e_1_3_2_1_11_1","unstructured":"NVIDIA Corporation. 2025a. NVIDIA GPU Operator for Kubernetes. https:\/\/docs.nvidia.com\/datacenter\/cloud-native\/gpu-operator\/. Accessed: 2025-09-29."},{"key":"e_1_3_2_1_12_1","unstructured":"NVIDIA Corporation. 2025b. NVIDIA Multi-Instance GPU (MIG). https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/. Accessed: 2025-09-29."},{"key":"e_1_3_2_1_13_1","unstructured":"NVIDIA Corporation. 2025c. Run:ai: GPU orchestration. https:\/\/docs.run.ai\/v2.20\/home\/overview\/. Accessed: 2025-01-01."},{"key":"e_1_3_2_1_14_1","volume-title":"On the Cost of Model-Serving Frameworks: An Experimental Evaluation. In 2024 IEEE International Conference on Cloud Engineering (IC2E). IEEE, 221-232","author":"Rosa Pasquale De","year":"2024","unstructured":"Pasquale De Rosa, Y\u00e9rom-David Bromberg, Pascal Felber, Djob Mvondo, and Valerio Schiavoni. 2024. On the Cost of Model-Serving Frameworks: An Experimental Evaluation. In 2024 IEEE International Conference on Cloud Engineering (IC2E). IEEE, 221-232."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","first-page":"225","DOI":"10.18778\/1731-7533.21.3.02","article-title":"Accents in Speech Recognition through the Lens of a World Englishes Evaluation Set","volume":"21","author":"Rio Miguel Del","year":"2023","unstructured":"Miguel Del Rio, Corey Miller, Jan Profant, Jennifer Drexler-Fox, Quinn McNamara, Nishchal Bhandari, Natalie Delworth, Ilya Pirkin, Mig\u00fcel Jett\u00e9, Shipra Chandra, Peter Ha, and Ryan Westerman. 2023. Accents in Speech Recognition through the Lens of a World Englishes Evaluation Set. Research in Language, Vol. 21, 3 (2023), 225-244.","journal-title":"Research in Language"},{"key":"e_1_3_2_1_16_1","volume-title":"llm-d-inference-sim: A light weight vLLM simulator, for mocking out replicas. https:\/\/github.com\/llm-d\/llm-d-inference-sim. https:\/\/github.com\/llm-d\/llm-d-inference-sim GitHub repository","year":"2025","unstructured":"llm-d Developers. 2025. llm-d-inference-sim: A light weight vLLM simulator, for mocking out replicas. https:\/\/github.com\/llm-d\/llm-d-inference-sim. https:\/\/github.com\/llm-d\/llm-d-inference-sim GitHub repository. Published: May 2025 (Project Announcement), Last Updated: November 6, 2025."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3757892.3757902"},{"key":"e_1_3_2_1_18_1","volume-title":"CNCF 2023 Annual Survey. https:\/\/www.cncf.io\/reports\/cncf-annual-survey-2024\/. Accessed: 2025-10-13","author":"Computing Foundation Cloud Native","year":"2023","unstructured":"Cloud Native Computing Foundation. 2023. CNCF 2023 Annual Survey. https:\/\/www.cncf.io\/reports\/cncf-annual-survey-2024\/. Accessed: 2025-10-13."},{"key":"e_1_3_2_1_19_1","first-page":"135","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. : serverless inference for large language models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 135-153."},{"key":"e_1_3_2_1_20_1","volume-title":"Share & Trends: Industry Report, 2030","author":"Research Grand View","year":"2025","unstructured":"Grand View Research. 2025. Cloud AI Market Size, Share & Trends: Industry Report, 2030. Technical Report GVR-4-68040-055-7. Grand View Research, Inc. 200 pages. https:\/\/www.grandviewresearch.com\/industry-analysis\/cloud-ai-market-report Historical range: 2018\u20132024, Forecast period: 2025\u20132030."},{"key":"e_1_3_2_1_21_1","volume-title":"ACM Queue","volume":"23","author":"Gschwind Michael","year":"2025","unstructured":"Michael Gschwind. 2025. AI: It's All About Inference Now. ACM Queue, Vol. 23 (May 2025). Issue 2. https:\/\/queue.acm.org\/detail.cfm?id=3733701"},{"key":"e_1_3_2_1_22_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models. ICLR, Vol. 1, 2 (2022), 3.","journal-title":"ICLR"},{"key":"e_1_3_2_1_23_1","unstructured":"Hugging Face. 2024. Qwen3-8B. https:\/\/huggingface.co\/Qwen\/Qwen3-8B. Accessed: 2025-10-27."},{"key":"e_1_3_2_1_24_1","unstructured":"Red Hat Inc. 2025. Dynamic Accelerator Slicer (DAS) Operator. https:\/\/github.com\/openshift\/instaslice-operator. Accessed: 2025-11-01."},{"key":"e_1_3_2_1_25_1","unstructured":"Kubernetes SIGs. 2025. Gateway API Inference Extension: Kubernetes-native routing & scheduling for GenAI inference workloads. https:\/\/github.com\/kubernetes-sigs\/gateway-api-inference-extension Accessed: 2025-10-27."},{"key":"e_1_3_2_1_26_1","unstructured":"Woosuk Kwon Sang-Woo Lee Hyunji Lee Beomsu Kim and Hyeontaek Lim. 2023. vLLM: Easy Fast and Cheap LLM Serving with PagedAttention. arXiv:2309.06180 [cs.DC] https:\/\/arxiv.org\/abs\/2309.06180"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545027"},{"key":"e_1_3_2_1_28_1","unstructured":"llm-d Project Authors. 2025. llm-d: a Kubernetes-native high-performance distributed LLM inference framework. https:\/\/llm-d.ai\/. Accessed: 2025-09-29."},{"key":"e_1_3_2_1_29_1","unstructured":"llm-d.ai. 2024a. Intelligent Inference Scheduling with llm-d. https:\/\/llm-d.ai\/blog\/intelligent-inference-scheduling-with-llm-d. https:\/\/llm-d.ai\/blog\/intelligent-inference-scheduling-with-llm-d Accessed: 2025-10-27."},{"key":"e_1_3_2_1_30_1","unstructured":"llm-d.ai. 2024b. KV-Cache Wins You Can See. https:\/\/llm-d.ai\/blog\/kvcache-wins-you-can-see. https:\/\/llm-d.ai\/blog\/kvcache-wins-you-can-see Accessed: 2025-10-27."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3629527.3651405"},{"key":"e_1_3_2_1_32_1","volume-title":"Manifests for ICPE'26","author":"Malleni Sai Sindhur","year":"2025","unstructured":"Sai Sindhur Malleni, Jos\u00e9 Castillo Lema, Ra\u00fal Sevilla, and Aleksei Vasilevskii. 2025. Manifests for ICPE'26. https:\/\/github.com\/RedHatResearch\/icpe26-k8s-ai-apis. Accessed: 2025-11-01."},{"key":"e_1_3_2_1_33_1","unstructured":"Market Research Future. 2024. Automatic Speech Recognition Software Market Size | CAGR of 24.16%. https:\/\/www.marketresearchfuture.com\/reports\/automatic-speech-recognition-asr-software-market-27251 Accessed: 2025-10-04."},{"key":"e_1_3_2_1_34_1","volume-title":"Accessed","author":"Maslej Nestor","year":"2025","unstructured":"Nestor Maslej, Loredana Fattorini, et al., 2025. The AI Index 2025 Annual Report. https:\/\/hai.stanford.edu\/ai-index\/2025-ai-index-report Accessed: November 1, 2025."},{"key":"e_1_3_2_1_35_1","unstructured":"Neural Magic Inc. 2024. GuideLLM: Scalable Inference and Optimization for Large Language Models. https:\/\/github.com\/vllm-project\/guidellm."},{"key":"e_1_3_2_1_36_1","unstructured":"OpenAI. 2022. Whisper GitHub Page. https:\/\/github.com\/openai\/whisper. GitHub repository. Accessed: 2025-11-07."},{"key":"e_1_3_2_1_37_1","unstructured":"OpenShift Team. 2025. Integration with Kueue in Instaslice Operator. https:\/\/github.com\/openshift\/instaslice-operator\/pull\/910. Accessed: 2025-11-07."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.3390\/electronics11121831"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3328283"},{"key":"e_1_3_2_1_40_1","unstructured":"Envoy Project. 2025. Envoy Proxy. https:\/\/www.envoyproxy.io\/. https:\/\/www.envoyproxy.io\/ Accessed: 2025-10-27."},{"key":"e_1_3_2_1_41_1","volume-title":"International conference on machine learning. PMLR, 28492-28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492-28518."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544788"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2023.3267168"},{"key":"e_1_3_2_1_44_1","unstructured":"Gaurav Singh Harshal Patil and Abhishek Malvankar. 2025. The benefits of dynamic GPU slicing in OpenShift. https:\/\/developers.redhat.com\/articles\/2025\/05\/06\/benefits-dynamic-gpu-slicing-openshift. Accessed: 2025-10-29."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3688351.3689156"},{"key":"e_1_3_2_1_46_1","volume-title":"Cloud Native System for LLM Inference Serving. arXiv preprint arXiv:2507.18007","author":"Xu Minxian","year":"2025","unstructured":"Minxian Xu, Junhan Liao, Jingfeng Wu, Yiyuan He, Kejiang Ye, and Chengzhong Xu. 2025. Cloud Native System for LLM Inference Serving. arXiv preprint arXiv:2507.18007 (2025)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3638757"},{"key":"e_1_3_2_1_48_1","volume-title":"KIS-S: A GPU-Aware Kubernetes Inference Simulator with RL-Based Auto-Scaling. arXiv preprint arXiv:2507.07932","author":"Zhang Guilin","year":"2025","unstructured":"Guilin Zhang, Wulan Guo, Ziqi Tan, Qiang Guan, and Hailong Jiang. 2025. KIS-S: A GPU-Aware Kubernetes Inference Simulator with RL-Based Auto-Scaling. arXiv preprint arXiv:2507.07932 (2025)."},{"key":"e_1_3_2_1_49_1","volume-title":"Muxflow: Efficient and safe gpu sharing in large-scale production deep learning clusters. arXiv preprint arXiv:2303.13803","author":"Zhao Yihao","year":"2023","unstructured":"Yihao Zhao, Xin Liu, Shufan Liu, Xiang Li, Yibo Zhu, Gang Huang, Xuanzhe Liu, and Xin Jin. 2023. Muxflow: Efficient and safe gpu sharing in large-scale production deep learning clusters. arXiv preprint arXiv:2303.13803 (2023)."},{"key":"e_1_3_2_1_50_1","first-page":"559","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al., 2022. Alpa: Automating inter-and parallelism for distributed deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 559-578."},{"key":"e_1_3_2_1_51_1","first-page":"193","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. : Disaggregating prefill and decoding for goodput-optimized large language model serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 193-210."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3573037"},{"key":"e_1_3_2_1_53_1","first-page":"489","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Zhou Zhe","year":"2022","unstructured":"Zhe Zhou, Xuechao Wei, Jiejing Zhang, and Guangyu Sun. 2022. : A unified framework for transformers serving. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 489-504."}],"event":{"name":"ICPE '26: 17th ACM\/SPEC International Conference on Performance Engineering","location":"Florence Italy","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SPEC"]},"container-title":["Proceedings of the 17th ACM\/SPEC International Conference on Performance Engineering"],"original-title":[],"deposited":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T12:28:10Z","timestamp":1776947290000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3777884.3796983"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,3]]},"references-count":53,"alternative-id":["10.1145\/3777884.3796983","10.1145\/3777884"],"URL":"https:\/\/doi.org\/10.1145\/3777884.3796983","relation":{},"subject":[],"published":{"date-parts":[[2026,5,3]]},"assertion":[{"value":"2026-05-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}