{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T13:13:56Z","timestamp":1776950036188,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","funder":[{"name":"International Business Machines Corporation","award":["10018078"],"award-info":[{"award-number":["10018078"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,4]]},"DOI":"10.1145\/3777884.3797818","type":"proceedings-article","created":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T12:27:26Z","timestamp":1776947246000},"page":"325-332","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["FLYT: Transparent and Elastic GPU Provisioning for Multi-Tenant Cloud Services"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-6889-6570","authenticated-orcid":false,"given":"Santhosh M.","family":"Kumar","sequence":"first","affiliation":[{"name":"Indian Institute of Technology Bombay, Mumbai, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6029-0156","authenticated-orcid":false,"given":"Sameer","family":"Ahmad","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology Bombay, Mumbai, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6543-1037","authenticated-orcid":false,"given":"Armaan","family":"Chowfin","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology, Mumbai, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0272-9299","authenticated-orcid":false,"given":"Purushottam","family":"Kulkarni","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology, Mumbai, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2189-555X","authenticated-orcid":false,"given":"Anand","family":"Eswaran","sequence":"additional","affiliation":[{"name":"IBM Research, Bangalore, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8961-9990","authenticated-orcid":false,"given":"Praveen","family":"Jayachandran","sequence":"additional","affiliation":[{"name":"IBM Research, Bangalore, India"}]}],"member":"320","published-online":{"date-parts":[[2026,5,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI . 117-134","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI . 117-134."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS58335.2023.00012"},{"key":"e_1_3_2_1_3_1","volume-title":"Analysis of virtio GPU in a containerized environment. In 2021 29th Telecommunications Forum (TELFOR). 1-3","author":"Bogdanovic Dejan","unstructured":"Dejan Bogdanovic, Miroslav Popovic, and Srdjan Usorac. 2021. Analysis of virtio GPU in a containerized environment. In 2021 29th Telecommunications Forum (TELFOR). 1-3."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCS.2010.5547126"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.6474"},{"key":"e_1_3_2_1_6_1","volume-title":"Uvmbench: A comprehensive benchmark suite for researching unified virtual memory in gpus. arXiv preprint arXiv:2007.09822","author":"Gu Yongbin","year":"2020","unstructured":"Yongbin Gu, Wenxuan Wu, Yunfan Li, and Lizhong Chen. 2020. Uvmbench: A comprehensive benchmark suite for researching unified virtual memory in gpus. arXiv preprint arXiv:2007.09822 (2020)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3533028.3533308"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_10_1","unstructured":"Munkyu Lee Sihoon Seong Minki Kang Jihyuk Lee Gap-Joo Na In-Geol Chun Dimitrios Nikolopoulos and Cheol-Ho Hong. 2024. ParvaGPU: Efficient Spatial GPU Sharing for Large-Scale DNN Inference in Cloud Environments. arXiv:2409.14447 https:\/\/arxiv.org\/abs\/2409.14447"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CloudCom.2019.00025"},{"key":"e_1_3_2_1_12_1","first-page":"595","volume-title":"Paella: Low-latency Model Serving with Software-defined GPU Scheduling. In ACM SIGOPS 29th Symposium on Operating Systems Principles (SOSP '23)","author":"Ng Kelvin KW","year":"2023","unstructured":"Kelvin KW Ng, Henri Maxime Demoulin, and Vincent Liu. 2023. Paella: Low-latency Model Serving with Software-defined GPU Scheduling. In ACM SIGOPS 29th Symposium on Operating Systems Principles (SOSP '23). 595-610."},{"key":"e_1_3_2_1_13_1","unstructured":"NVIDIA Corporation. 2024. CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/"},{"key":"e_1_3_2_1_14_1","unstructured":"NVIDIA Corporation. 2024. Multi-Process Service (MPS) Documentation. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html"},{"key":"e_1_3_2_1_15_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA Multi-Instance GPU (MIG) User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/"},{"key":"e_1_3_2_1_16_1","unstructured":"NVIDIA Corporation. 2025. Virtual GPU Software User Guide - NVIDIA Docs. https:\/\/docs.nvidia.com\/vgpu\/19.0\/grid-vgpu-user-guide\/index.html"},{"key":"e_1_3_2_1_17_1","volume-title":"Guardian: Safe GPU Sharing in Multi-Tenant Environments. arXiv:2401.09290 https:\/\/arxiv.org\/abs\/2401.09290","author":"Pavlidakis Manos","year":"2024","unstructured":"Manos Pavlidakis, Giorgos Vasiliadis, Stelios Mavridis, Anargyros Argyros, Antony Chazapis, and Angelos Bilas. 2024. Guardian: Safe GPU Sharing in Multi-Tenant Environments. arXiv:2401.09290 https:\/\/arxiv.org\/abs\/2401.09290"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2011.112"},{"key":"e_1_3_2_1_19_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and AndrewZisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629578"},{"key":"e_1_3_2_1_21_1","volume-title":"International conference on machine learning. 6105-6114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning. 6105-6114."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2023.10.022"},{"key":"e_1_3_2_1_23_1","first-page":"595","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Q Zhang, et al. 2018. Gandiva: Introspective cluster scheduling for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 595-610."},{"key":"e_1_3_2_1_24_1","first-page":"533","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng,Wei Lin, and Yangqing Jia. 2020. AntMan: Dynamic scaling on GPU clusters for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 533-548."},{"key":"e_1_3_2_1_25_1","volume-title":"Prism: Unleashing GPU Sharing for Cost-Efficient Multi-LLM Serving. arXiv preprint arXiv:2505.04021","author":"Yu Shan","year":"2025","unstructured":"Shan Yu, Jiarong Xing, Yifan Qiao, Mingyuan Ma, Yangmin Li, Yang Wang, Shuo Yang, Zhiqiang Xie, Shiyi Cao, Ke Bao, et al. 2025. Prism: Unleashing GPU Sharing for Cost-Efficient Multi-LLM Serving. arXiv preprint arXiv:2505.04021 (2025)."}],"event":{"name":"ICPE '26: 17th ACM\/SPEC International Conference on Performance Engineering","location":"Florence Italy","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SPEC"]},"container-title":["Proceedings of the 17th ACM\/SPEC International Conference on Performance Engineering"],"original-title":[],"deposited":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T12:28:38Z","timestamp":1776947318000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3777884.3797818"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,3]]},"references-count":25,"alternative-id":["10.1145\/3777884.3797818","10.1145\/3777884"],"URL":"https:\/\/doi.org\/10.1145\/3777884.3797818","relation":{},"subject":[],"published":{"date-parts":[[2026,5,3]]},"assertion":[{"value":"2026-05-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}