{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T23:27:30Z","timestamp":1776814050931,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","funder":[{"name":"NSF CNS","award":["2239311"],"award-info":[{"award-number":["2239311"]}]},{"name":"NSF CCF","award":["2217016"],"award-info":[{"award-number":["2217016"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764818","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"1-17","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["LithOS: An Operating System for Efficient Machine Learning on GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7101-6961","authenticated-orcid":false,"given":"Patrick H.","family":"Coppock","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3190-3129","authenticated-orcid":false,"given":"Brian","family":"Zhang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0570-183X","authenticated-orcid":false,"given":"Eliot H.","family":"Solomon","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9732-3360","authenticated-orcid":false,"given":"Vasilis","family":"Kypriotis","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5702-7687","authenticated-orcid":false,"given":"Leon","family":"Yang","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8347-8207","authenticated-orcid":false,"given":"Bikash","family":"Sharma","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0945-4429","authenticated-orcid":false,"given":"Dan","family":"Schatzberg","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4076-5684","authenticated-orcid":false,"given":"Todd C.","family":"Mowry","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0289-5499","authenticated-orcid":false,"given":"Dimitrios","family":"Skarlatos","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 2024 IEEE\/ACM 46th International Conference on Software Engineering: Companion Proceedings. 16\u201320","author":"Alexopoulos Georgios","year":"2024","unstructured":"Georgios Alexopoulos and Dimitris Mitropoulos. 2024. nvshare: Practical GPU Sharing without Memory Size Constraints. In Proceedings of the 2024 IEEE\/ACM 46th International Conference on Software Engineering: Companion Proceedings. 16\u201320."},{"key":"e_1_3_2_1_2_1","volume-title":"PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Bai Zhihao","year":"2020","unstructured":"Zhihao Bai, Zhen Zhang, Yibo Zhu, and Xin Jin. 2020. PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 499\u2013514."},{"key":"e_1_3_2_1_3_1","volume-title":"2022 IEEE Real-Time Systems Symposium (RTSS). IEEE, 370\u2013382","author":"Bakita Joshua","unstructured":"Joshua Bakita and James H. Anderson. 2022. Enabling GPU Memory Oversubscription via Transparent Paging to an NVMe SSD. In 2022 IEEE Real-Time Systems Symposium (RTSS). IEEE, 370\u2013382."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the 29th IEEE Real-Time and Embedded Technology and Applications Symposium. 54\u201366","author":"Bakita Joshua","unstructured":"Joshua Bakita and James H. Anderson. 2023. Hardware Compute Partitioning on NVIDIA GPUs. In Proceedings of the 29th IEEE Real-Time and Embedded Technology and Applications Symposium. 54\u201366."},{"key":"e_1_3_2_1_5_1","unstructured":"Alexey Bochkovskiy Chien-Yao Wang and Hong-Yuan Mark Liao. 2020. YOLOv4: Optimal Speed and Accuracy of Object Detection. arXiv:2004.10934 [cs.CV] https:\/\/arxiv.org\/abs\/2004.10934"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"3","author":"Chang Chia-Hao","year":"2024","unstructured":"Chia-Hao Chang, Jihoon Han, Anand Sivasubramaniam, Vikram Sharma Mailthody, Zaid Qureshi, and Wen-Mei Hwu. 2024. GMT: GPU Orchestrated Memory Tiering for the Big Data Era. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3 (La Jolla, CA, USA) (ASPLOS '24). Association for Computing Machinery, New York, NY, USA, 464\u2013478. 10.1145\/3620666.3651353"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCC.2021.3119205"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 36th Annual ACM Symposium on Applied Computing (Virtual Event, Republic of Korea) (SAC '21)","author":"Chen Qichen","year":"2021","unstructured":"Qichen Chen, Hyerin Chung, Yongseok Son, Yoonhee Kim, and Heon Young Yeom. 2021. smCompactor: a workload-aware fine-grained resource management framework for GPGPUs. In Proceedings of the 36th Annual ACM Symposium on Applied Computing (Virtual Event, Republic of Korea) (SAC '21). Association for Computing Machinery, New York, NY, USA, 1147\u20131155. 10.1145\/3412841.3441989"},{"key":"e_1_3_2_1_9_1","volume-title":"Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. 2022. Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA, 199\u2013216. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/choi-seungbeom"},{"key":"e_1_3_2_1_10_1","volume-title":"KRISP: Enabling Kernel-wise RIght-sizing for Spatial Partitioned GPU Inference Servers. In 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA). 624\u2013637","author":"Chow Marcus","year":"2023","unstructured":"Marcus Chow, Ali Jahanshahi, and Daniel Wong. 2023. KRISP: Enabling Kernel-wise RIght-sizing for Spatial Partitioned GPU Inference Servers. In 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA). 624\u2013637. 10.1109\/HPCA56546.2023.10071121"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 14th International Green and Sustainable Computing Conference","author":"Chow Marcus","year":"2024","unstructured":"Marcus Chow and Daniel Wong. 2024. CoFRIS: Coordinated Frequency and Resource Scaling for GPU Inference Servers. In Proceedings of the 14th International Green and Sustainable Computing Conference (Toronto, ON, Canada) (IGSC '23). Association for Computing Machinery, New York, NY, USA, 45\u201351. 10.1145\/3634769.3634808"},{"key":"e_1_3_2_1_12_1","volume-title":"d.]. Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html. Accessed","author":"NVIDIA Corporation","year":"2025","unstructured":"NVIDIA Corporation. [n. d.]. Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html. Accessed: April 14, 2025."},{"key":"e_1_3_2_1_13_1","unstructured":"NVIDIA Corporation. 2023. NVIDIA H100 Tensor Core GPU Architecture. Technical Report. NVIDIA Corporation Santa Clara CA."},{"key":"e_1_3_2_1_14_1","volume-title":"Triton Inference Server. https:\/\/developer.nvidia.com\/triton-inference-server. Accessed","author":"NVIDIA Corporation","year":"2024","unstructured":"NVIDIA Corporation. 2024. Triton Inference Server. https:\/\/developer.nvidia.com\/triton-inference-server. Accessed: May 8, 2024."},{"key":"e_1_3_2_1_15_1","volume-title":"Accessed","author":"NVIDIA Corporation","year":"2025","unstructured":"NVIDIA Corporation. 2025. NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html. Accessed: April 14, 2025."},{"key":"e_1_3_2_1_16_1","unstructured":"NVIDIA Corporation. 2025. NVIDIA RTX Blackwell GPU Architecture. https:\/\/images.nvidia.com\/aem-dam\/Solutions\/geforce\/blackwell\/nvidia-rtx-blackwell-gpu-architecture.pdf."},{"key":"e_1_3_2_1_17_1","unstructured":"NVIDIA Corporation. 2025. Open GPU documentation. https:\/\/github.com\/NVIDIA\/open-gpu-doc."},{"key":"e_1_3_2_1_18_1","volume-title":"Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J. Franklin, Joseph E. Gonzalez, and Ion Stoica. 2017. Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 613\u2013627. https:\/\/www.usenix.org\/conference\/nsdi17\/technical-sessions\/presentation\/crankshaw"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (St","author":"Cui Weihao","year":"2021","unstructured":"Weihao Cui, Han Zhao, Quan Chen, Ningxin Zheng, Jingwen Leng, Jieru Zhao, Zhuo Song, Tao Ma, Yong Yang, Chao Li, and Minyi Guo. 2021. Enable simultaneous DNN services based on deterministic operator overlap and precise latency prediction. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (St. Louis, Missouri) (SC '21). Association for Computing Machinery, New York, NY, USA, Article 15, 15 pages. 10.1145\/3458817.3476143"},{"key":"e_1_3_2_1_20_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 11th ACM Symposium on Cloud Computing","author":"Dhakal Aditya","year":"1911","unstructured":"Aditya Dhakal, Sameer G Kulkarni, and K. K. Ramakrishnan. 2020. GSLICE: Controlled Spatial Sharing of GPUs for a Scalable Inference Platform. In Proceedings of the 11th ACM Symposium on Cloud Computing (Virtual Event, USA) (SoCC '20). Association for Computing Machinery, New York, NY, USA, 492\u2013506. 10.1145\/3419111.3421284"},{"key":"e_1_3_2_1_22_1","volume-title":"Nvidia announces \"Rubin Ultra\" and \"Feynman\" AI chips for 2027 and","author":"Edwards Benj","year":"2028","unstructured":"Benj Edwards. 2025. Nvidia announces \"Rubin Ultra\" and \"Feynman\" AI chips for 2027 and 2028. https:\/\/arstechnica.com\/ai\/2025\/03\/nvidia-announces-rubin-ultra-and-feynman-ai-chips-for-2027-and-2028\/"},{"key":"e_1_3_2_1_23_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Fried Joshua","year":"2020","unstructured":"Joshua Fried, Zhenyuan Ruan, Amy Ousterhout, and Adam Belay. 2020. Caladan: Mitigating Interference at Microsecond Timescales. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 281\u2013297. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/fried"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the IEEE\/ACM 46th International Conference on Software Engineering. 1\u201313","author":"Gao Yanjie","year":"2024","unstructured":"Yanjie Gao, Yichen He, Xinze Li, Bo Zhao, Haoxiang Lin, Yoyo Liang, Jing Zhong, Hongyu Zhang, Jingzhou Wang, Yonghua Zeng, et al. 2024. An Empirical Study on Low GPU Utilization of Deep Learning Jobs. In Proceedings of the IEEE\/ACM 46th International Conference on Software Engineering. 1\u201313."},{"key":"e_1_3_2_1_25_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian and Ahmad Al-Dahle et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_26_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 443\u2013462. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/gujarati"},{"key":"e_1_3_2_1_27_1","volume-title":"Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. 2022. Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 539\u2013558. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/han"},{"key":"e_1_3_2_1_28_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv:1512.03385 [cs.CV] https:\/\/arxiv.org\/abs\/1512.03385"},{"key":"e_1_3_2_1_29_1","volume-title":"2019 IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS). IEEE, 29\u201341","author":"Jain Saksham","year":"2019","unstructured":"Saksham Jain, Iljoo Baek, Shige Wang, and Ragunathan Rajkumar. 2019. Fractional GPUs: Software-based compute and memory bandwidth reservation for GPUs. In 2019 IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS). IEEE, 29\u201341."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 2019 USENIX Conference on Usenix Annual Technical Conference (USENIX ATC '19). USENIX Association, USA, 947\u2013960","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, unjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In Proceedings of the 2019 USENIX Conference on Usenix Annual Technical Conference (USENIX ATC '19). USENIX Association, USA, 947\u2013960."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Andreas Kosmas Kakolyris Dimosthenis Masouros Petros Vavaroutsos Sotirios Xydis and Dimitrios Soudris. 2024. SLO-aware GPU Frequency Scaling for Energy Efficient LLM Inference Serving. arXiv:2408.05235 [cs.DC] https:\/\/arxiv.org\/abs\/2408.05235","DOI":"10.1109\/LCA.2024.3406038"},{"key":"e_1_3_2_1_32_1","volume-title":"RT-Swap: Addressing GPU Memory Bottlenecks for Real-Time Multi-DNN Inference. In 2024 IEEE 30th Real-Time and Embedded Technology and Applications Symposium (RTAS). IEEE, 373\u2013385","author":"Kang Woosung","year":"2024","unstructured":"Woosung Kang, Jinkyu Lee, Youngmoon Lee, Sangeun Oh, Kilho Lee, and Hoon Sung Chwa. 2024. RT-Swap: Addressing GPU Memory Bottlenecks for Real-Time Multi-DNN Inference. In 2024 IEEE 30th Real-Time and Embedded Technology and Applications Symposium (RTAS). IEEE, 373\u2013385."},{"key":"e_1_3_2_1_33_1","volume-title":"2011 IEEE 32nd Real-Time Systems Symposium. IEEE, 57\u201366","author":"Kato Shinpei","year":"2011","unstructured":"Shinpei Kato, Karthik Lakshmanan, Aman Kumar, Mihir Kelkar, Yutaka Ishikawa, and Ragunathan Rajkumar. 2011. RGEM: A responsive GPGPU execution model for runtime engines. In 2011 IEEE 32nd Real-Time Systems Symposium. IEEE, 57\u201366."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 2011 USENIX Conference on USENIX Annual Technical Conference","author":"Kato Shinpei","year":"2011","unstructured":"Shinpei Kato, Karthik Lakshmanan, Ragunathan Rajkumar, and Yutaka Ishikawa. 2011. TimeGraph: GPU scheduling for real-time multitasking environments. In Proceedings of the 2011 USENIX Conference on USENIX Annual Technical Conference (Portland, OR) (USENIXATC'11). USENIX Association, USA, 2."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 59th ACM\/IEEE Design Automation Conference","author":"Kim Yunseong","year":"2022","unstructured":"Yunseong Kim, Yujeong Choi, and Minsoo Rhu. 2022. PARIS and ELSA: an elastic scheduling algorithm for reconfigurable multi-GPU inference servers. In Proceedings of the 59th ACM\/IEEE Design Automation Conference (San Francisco, California) (DAC '22). Association for Computing Machinery, New York, NY, USA, 607\u2013612. 10.1145\/3489517.3530510"},{"key":"e_1_3_2_1_36_1","unstructured":"Beth Kindig. 2024. AI power consumption: Rapidly becoming mission-critical. https:\/\/www.forbes.com\/sites\/bethkindig\/2024\/06\/20\/ai-power-consumption-rapidly-becoming-mission-critical\/"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 13th Symposium on Cloud Computing","author":"Li Baolin","year":"2022","unstructured":"Baolin Li, Tirthak Patel, Siddharth Samsi, Vijay Gadepally, and Devesh Tiwari. 2022. MISO: Exploiting Multi-Instance GPU Capability on Multi-Tenant GPU Clusters. In Proceedings of the 13th Symposium on Cloud Computing (San Francisco, California) (SoCC '22). Association for Computing Machinery, New York, NY, USA, 173\u2013189. 10.1145\/3542929.3563510"},{"key":"e_1_3_2_1_38_1","unstructured":"Tsung-Yi Lin Priya Goyal Ross Girshick Kaiming He and Piotr Doll\u00e1r. 2018. Focal Loss for Dense Object Detection. arXiv:1708.02002 [cs.CV] https:\/\/arxiv.org\/abs\/1708.02002"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4227-2"},{"key":"e_1_3_2_1_40_1","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. arXiv:1906.00091 [cs.IR] https:\/\/arxiv.org\/abs\/1906.00091"},{"key":"e_1_3_2_1_41_1","volume-title":"Dell exec reveals Nvidia has a 1,000 watt GPU in the works. https:\/\/www.msn.com\/en-us\/lifestyle\/other\/dell-exec-reveals-nvidia-has-a-1-000-watt-gpu-in-the-works\/ar-BB1jlE8f Accessed","author":"Network Microsoft","year":"2024","unstructured":"Microsoft Network. 2024. Dell exec reveals Nvidia has a 1,000 watt GPU in the works. https:\/\/www.msn.com\/en-us\/lifestyle\/other\/dell-exec-reveals-nvidia-has-a-1-000-watt-gpu-in-the-works\/ar-BB1jlE8f Accessed: June 24, 2024."},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Ng Kelvin K. W.","year":"2023","unstructured":"Kelvin K. W. Ng, Henri Maxime Demoulin, and Vincent Liu. 2023. Paella: Low-latency Model Serving with Software-defined GPU Scheduling. In Proceedings of the 29th Symposium on Operating Systems Principles (Koblenz, Germany) (SOSP '23). Association for Computing Machinery, New York, NY, USA, 595\u2013610. 10.1145\/3600006.3613163"},{"key":"e_1_3_2_1_43_1","volume-title":"d.]. NVIDIA CUDA Driver API Documentation: Occupancy","author":"NVIDIA Corporation","unstructured":"NVIDIA Corporation. [n. d.]. NVIDIA CUDA Driver API Documentation: Occupancy. NVIDIA Corporation. https:\/\/docs.nvidia.com\/cuda\/cuda-driver-api\/group__CUDA__OCCUPANCY.html"},{"key":"e_1_3_2_1_44_1","volume-title":"Virtual GPU Software User Guide (v13.0)","author":"NVIDIA Corporation","unstructured":"NVIDIA Corporation. 2024. Virtual GPU Software User Guide (v13.0). NVIDIA Corporation. https:\/\/docs.nvidia.com\/vgpu\/13.0\/grid-vgpu-user-guide\/index.html Version 13.0, Accessed: 2025-08-28."},{"key":"e_1_3_2_1_45_1","unstructured":"Christopher Olston Noah Fiedel Kiril Gorovoy Jeremiah Harmsen Li Lao Fangwei Li Vinu Rajashekhar Sukriti Ramesh and Jordan Soyke. 2017. TensorFlow-Serving: Flexible High-Performance ML Serving. arXiv:1712.06139 [cs.DC]"},{"key":"e_1_3_2_1_46_1","volume-title":"Shenango: Achieving High CPU Efficiency for Latency-sensitive Datacenter Workloads. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Ousterhout Amy","year":"2019","unstructured":"Amy Ousterhout, Joshua Fried, Jonathan Behrens, Adam Belay, and Hari Balakrishnan. 2019. Shenango: Achieving High CPU Efficiency for Latency-sensitive Datacenter Workloads. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). USENIX Association, Boston, MA, 361\u2013378. https:\/\/www.usenix.org\/conference\/nsdi19\/presentation\/ousterhout"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"3","author":"Patel Pratyush","year":"2024","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, \u00cd\u00f1igo Goiri, Brijesh Warrier, Nithish Mahalingam, and Ricardo Bianchini. 2024. Characterizing Power Management Opportunities for LLMs in the Cloud. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3 (La Jolla, CA, USA) (ASPLOS '24). Association for Computing Machinery, New York, NY, USA, 207\u2013222. 10.1145\/3620666.3651329"},{"key":"e_1_3_2_1_48_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Qiu Haoran","unstructured":"Haoran Qiu, Weichao Mao, Archit Patke, Shengkun Cui, Saurabh Jha, Chen Wang, Hubertus Franke, Zbigniew Kalbarczyk, Tamer Ba\u015far, and Ravishankar K. Iyer. 2024. Power-aware Deep Learning Model Serving with \u03bc-Serve. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 75\u201393. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/qiu"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Qureshi Zaid","year":"2023","unstructured":"Zaid Qureshi, Vikram Sharma Mailthody, Isaac Gelado, Seungwon Min, Amna Masood, Jeongmin Park, Jinjun Xiong, C. J. Newburn, Dmitri Vainbrand, I-Hsin Chung, Michael Garland, William Dally, and Wen-mei Hwu. 2023. GPU-Initiated On-Demand High-Throughput Storage Access in the BaM System Architecture. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (Vancouver, BC, Canada) (ASPLOS 2023). Association for Computing Machinery, New York, NY, USA, 325\u2013339. 10.1145\/3575693.3575748"},{"key":"e_1_3_2_1_50_1","volume-title":"Dilip Sequeira, Ashish Sirasao, Fei Sun, Hanlin Tang, Michael Thomson, Frank Wei, Ephrem Wu, Lingjie Xu, Koichi Yamada, Bing Yu, George Yuan, Aaron Zhong, Peizhao Zhang, and Yuchen Zhou.","author":"Reddi Vijay Janapa","year":"2019","unstructured":"Vijay Janapa Reddi, Christine Cheng, David Kanter, Peter Mattson, Guenther Schmuelling, Carole-Jean Wu, Brian Anderson, Maximilien Breughe, Mark Charlebois, William Chou, Ramesh Chukka, Cody Coleman, Sam Davis, Pan Deng, Greg Diamos, Jared Duke, Dave Fick, J. Scott Gardner, Itay Hubara, Sachin Idgunji, Thomas B. Jablin, Jeff Jiao, Tom St. John, Pankaj Kanwar, David Lee, Jeffery Liao, Anton Lokhmotov, Francisco Massa, Peng Meng, Paulius Micikevicius, Colin Osborne, Gennady Pekhimenko, Arun Tejusve Raghunath Rajan, Dilip Sequeira, Ashish Sirasao, Fei Sun, Hanlin Tang, Michael Thomson, Frank Wei, Ephrem Wu, Lingjie Xu, Koichi Yamada, Bing Yu, George Yuan, Aaron Zhong, Peizhao Zhang, and Yuchen Zhou. 2019. MLPerf Inference Benchmark. arXiv:1911.02549 [cs.LG]"},{"key":"e_1_3_2_1_51_1","volume-title":"INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J. Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 397\u2013411. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/romero"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the Twenty-Third ACM Symposium on Operating Systems Principles","author":"Rossbach Christopher J.","year":"2011","unstructured":"Christopher J. Rossbach, Jon Currey, Mark Silberstein, Baishakhi Ray, and Emmett Witchel. 2011. PTask: operating system abstractions to manage GPUs as compute devices. In Proceedings of the Twenty-Third ACM Symposium on Operating Systems Principles (Cascais, Portugal) (SOSP '11). Association for Computing Machinery, New York, NY, USA, 233\u2013248. 10.1145\/2043556.2043579"},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 4510\u20134520","author":"Sandler Mark","year":"2018","unstructured":"Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, and Liang-Chieh Chen. 2018. Mobilenetv2: Inverted residuals and linear bottlenecks. In Proceedings of the IEEE conference on computer vision and pattern recognition. 4510\u20134520."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles (Huntsville","author":"Shen Haichen","year":"2019","unstructured":"Haichen Shen, Lequn Chen, Yuchen Jin, Liangyu Zhao, Bingyu Kong, Matthai Philipose, Arvind Krishnamurthy, and Ravi Sundaram. 2019. Nexus: A GPU Cluster Engine for Accelerating DNN-Based Video Analysis. In Proceedings of the 27th ACM Symposium on Operating Systems Principles (Huntsville, Ontario, Canada) (SOSP '19). Association for Computing Machinery, New York, NY, USA, 322\u2013337. 10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/2553081"},{"key":"e_1_3_2_1_56_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. arXiv:1409.1556 [cs.CV] https:\/\/arxiv.org\/abs\/1409.1556"},{"key":"e_1_3_2_1_57_1","volume-title":"High-throughput and Flexible Host Networking for Accelerated Computing. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Skiadopoulos Athinagoras","year":"2024","unstructured":"Athinagoras Skiadopoulos, Zhiqiang Xie, Mark Zhao, Qizhe Cai, Saksham Agarwal, Jacob Adelmann, David Ahern, Carlo Contavalli, Michael Goldflam, Vitaly Mayatskikh, Raghu Raja, Daniel Walton, Rachit Agarwal, Shrijeet Mukherjee, and Christos Kozyrakis. 2024. High-throughput and Flexible Host Networking for Accelerated Computing. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 405\u2013423. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/skiadopoulos"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Jovan Stojkovic Chaojie Zhang \u00cd\u00f1igo Goiri Josep Torrellas and Esha Choukse. 2024. DynamoLLM: Designing LLM Inference Clusters for Performance and Energy Efficiency. arXiv:2408.00741 [cs.AI] https:\/\/arxiv.org\/abs\/2408.00741","DOI":"10.1109\/HPCA61900.2025.00102"},{"key":"e_1_3_2_1_59_1","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems","author":"Strati Foteini","year":"2024","unstructured":"Foteini Strati, Xianzhe Ma, and Ana Klimovic. 2024. Orion: Interference-aware, Fine-grained GPU Sharing for ML Applications. In Proceedings of the Nineteenth European Conference on Computer Systems (Athens, Greece) (EuroSys '24). Association for Computing Machinery, New York, NY, USA, 1075\u20131092. 10.1145\/3627703.3629578"},{"key":"e_1_3_2_1_60_1","unstructured":"Cheng Tan Zhichao Li Jian Zhang Yu Cao Sikai Qi Zherui Liu Yibo Zhu and Chuanxiong Guo. 2021. Serving DNN Models with Multi-Instance GPUs: A Case of the Reconfigurable Machine Scheduling Problem. arXiv:2109.11067 [cs.DC]"},{"key":"e_1_3_2_1_61_1","unstructured":"VMware. 2020. SHARING GPUS IN MACHINE LEARNING ENVIRONMENTS. VMware. https:\/\/www.vmware.com\/docs\/vmware-ai-ml-rama"},{"key":"e_1_3_2_1_62_1","unstructured":"Ben Wang and Aran Komatsuzaki. 2021. GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model. https:\/\/github.com\/kingoflolz\/mesh-transformer-jax."},{"key":"e_1_3_2_1_63_1","volume-title":"Improving GPU Multi-Tenancy Through Dynamic Multi-Instance GPU Reconfiguration. arXiv preprint arXiv:2407.13126","author":"Wang Tianyu","year":"2024","unstructured":"Tianyu Wang, Sheng Li, Bingyao Li, Yue Dai, Ao Li, Geng Yuan, Yufei Ding, Youtao Zhang, and Xulong Tang. 2024. Improving GPU Multi-Tenancy Through Dynamic Multi-Instance GPU Reconfiguration. arXiv preprint arXiv:2407.13126 (2024)."},{"key":"e_1_3_2_1_64_1","volume-title":"Transparent GPU Sharing in Container Clouds for Deep Learning Workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Zili Zhang, Zhihao Bai, Xuanzhe Liu, and Xin Jin. 2023. Transparent GPU Sharing in Container Clouds for Deep Learning Workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 69\u201385. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/wu"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI'20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI'20). USENIX Association, USA, Article 30, 16 pages."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3232715"},{"key":"e_1_3_2_1_67_1","volume-title":"Proceedings of the 29th International Symposium on High-Performance Parallel and Distributed Computing","author":"Yeh Ting-An","year":"2020","unstructured":"Ting-An Yeh, Hung-Hsin Chen, and Jerry Chou. 2020. KubeShare: A Framework to Manage GPUs as First-Class and Shared Resources in Container Cloud. In Proceedings of the 29th International Symposium on High-Performance Parallel and Distributed Computing (Stockholm, Sweden) (HPDC '20). Association for Computing Machinery, New York, NY, USA, 173\u2013184. 10.1145\/3369583.3392679"},{"key":"e_1_3_2_1_68_1","volume-title":"Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems. 807\u2013825","author":"Yu Hangchen","year":"2020","unstructured":"Hangchen Yu, Arthur Michener Peters, Amogh Akshintala, and Christopher J Rossbach. 2020. AvA: Accelerated virtualization of accelerators. In Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems. 807\u2013825."},{"key":"e_1_3_2_1_69_1","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems","author":"Zhang Yijia","year":"2024","unstructured":"Yijia Zhang, Qiang Wang, Zhe Lin, Pengxiang Xu, and Bingqiang Wang. 2024. Improving GPU Energy Efficiency through an Application-transparent Frequency Scaling Policy with Performance Assurance. In Proceedings of the Nineteenth European Conference on Computer Systems (Athens, Greece) (EuroSys '24). Association for Computing Machinery, New York, NY, USA, 769\u2013785. 10.1145\/3627703.3629584"},{"key":"e_1_3_2_1_70_1","volume-title":"Missile: FineGrained, Hardware-Level GPU Resource Isolation for Multi-Tenant DNN Inference. arXiv preprint arXiv:2407.13996","author":"Zhang Yongkang","year":"2024","unstructured":"Yongkang Zhang, Haoxuan Yu, Chenxia Han, Cheng Wang, Baotong Lu, Yang Li, Xiaowen Chu, and Huaicheng Li. 2024. Missile: FineGrained, Hardware-Level GPU Resource Isolation for Multi-Tenant DNN Inference. arXiv preprint arXiv:2407.13996 (2024)."},{"key":"e_1_3_2_1_71_1","volume-title":"Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","author":"Zhang Yongkang","year":"2025","unstructured":"Yongkang Zhang, Haoxuan Yu, Chenxia Han, Cheng Wang, Baotong Lu, Yunzhe Li, Zhifeng Jiang, Yang Li, Xiaowen Chu, and Huaicheng Li. 2025. SGDRC: Software-Defined Dynamic Resource Control for Concurrent DNN Inference on NVIDIA GPUs. In Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming (Las Vegas, NV, USA) (PPoPP '25). Association for Computing Machinery, New York, NY, USA, 267\u2013281. 10.1145\/3710848.3710863"},{"key":"e_1_3_2_1_72_1","volume-title":"Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems","author":"Zhao Xia","year":"2020","unstructured":"Xia Zhao, Magnus Jahre, and Lieven Eeckhout. 2020. HSM: A Hybrid Slowdown Model for Multitasking GPUs. In Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems (Lausanne, Switzerland) (ASPLOS '20). Association for Computing Machinery, New York, NY, USA, 1371\u20131385. 10.1145\/3373376.3378457"}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:53:26Z","timestamp":1759323206000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764818"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":72,"alternative-id":["10.1145\/3731569.3764818","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764818","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}