{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T13:14:59Z","timestamp":1776950099468,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":107,"publisher":"ACM","funder":[{"name":"National Science Foundation","award":["2112606"],"award-info":[{"award-number":["2112606"]}]},{"name":"National Science Foundation","award":["2340722"],"award-info":[{"award-number":["2340722"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,4]]},"DOI":"10.1145\/3777884.3797000","type":"proceedings-article","created":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T12:27:26Z","timestamp":1776947246000},"page":"242-254","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MQGPU: A Multi-Queue Scheduling Framework For GPU Accelerated Serverless Functions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7286-4674","authenticated-orcid":false,"given":"Alexander","family":"Fuerst","sequence":"first","affiliation":[{"name":"Indiana University, Bloomington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9942-2472","authenticated-orcid":false,"given":"Siddharth","family":"Anil","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1789-0145","authenticated-orcid":false,"given":"Prateek","family":"Sharma","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,5,3]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Netflix & AWS Lambda Case Study. https:\/\/aws.amazon.com\/solutions\/case-studies\/netflix-and-aws-lambda\/."},{"key":"e_1_3_2_1_2_1","unstructured":"Nvidia management library. https:\/\/developer.nvidia.com\/nvidia-management-library-nvml."},{"key":"e_1_3_2_1_3_1","volume-title":"A cloud research testbed. https:\/\/www.cloudlab.us","year":"2014","unstructured":"CloudLab: A cloud research testbed. https:\/\/www.cloudlab.us, 2014. Accessed: July. 2024."},{"key":"e_1_3_2_1_4_1","volume-title":"June","year":"2015","unstructured":"Docker. https:\/\/www.docker.com\/, June 2015."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/developer.nvidia.com\/blog\/unified-memory-cuda-beginners\/","author":"Beginners Unified Memory","year":"2017","unstructured":"Unified Memory for CUDA Beginners. https:\/\/developer.nvidia.com\/blog\/unified-memory-cuda-beginners\/, 2017."},{"key":"e_1_3_2_1_6_1","volume-title":"https:\/\/aws.amazon.com\/lambda\/","author":"Lambda AWS","year":"2020","unstructured":"AWS Lambda. https:\/\/aws.amazon.com\/lambda\/, 2020."},{"key":"e_1_3_2_1_7_1","volume-title":"https:\/\/docs.nvidia.com\/datacenter\/cloud-native\/container-toolkit\/latest\/install-guide.html","author":"Nvidia","year":"2020","unstructured":"Nvidia container toolkit install guide. https:\/\/docs.nvidia.com\/datacenter\/cloud-native\/container-toolkit\/latest\/install-guide.html, 2020."},{"key":"e_1_3_2_1_8_1","volume-title":"https:\/\/www.alibabacloud.com\/help\/en\/fc\/use-cases\/quasi-real-time-inference-scenarios#section-rzz-zcb-w4e","author":"Alibaba","year":"2024","unstructured":"Alibaba cloud gpu function cold start overheads. https:\/\/www.alibabacloud.com\/help\/en\/fc\/use-cases\/quasi-real-time-inference-scenarios#section-rzz-zcb-w4e, 2024."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3185768.3186294"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567496"},{"key":"e_1_3_2_1_11_1","volume-title":"May","author":"Acun Bilge","year":"2022","unstructured":"Bilge Acun, Benjamin Lee, Fiodar Kazhamiaka, Kiwan Maeng, Manoj Chakkaravarthy, Udit Gupta, David Brooks, and Carole-Jean Wu. Carbon Explorer: A Holistic Approach for Designing Carbon Aware Datacenters, May 2022. arXiv:2201.10036 [cs, eess]."},{"key":"e_1_3_2_1_12_1","first-page":"419","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Agache Alexandru","year":"2020","unstructured":"Alexandru Agache, Marc Brooker, Alexandra Iordache, Anthony Liguori, Rolf Neugebauer, Phil Piwonka, and Diana-Maria Popa. Firecracker: Lightweight virtualization for serverless applications. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20), pages 419-434, 2020."},{"key":"e_1_3_2_1_13_1","volume-title":"nvshare: Practical gpu sharing without memory size constraints. https:\/\/github.com\/grgalex\/nvshare","author":"Alexopoulos Georgios","year":"2023","unstructured":"Georgios Alexopoulos and Dimitris Mitropoulos. nvshare: Practical gpu sharing without memory size constraints. https:\/\/github.com\/grgalex\/nvshare, 2023."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.14778\/3547305.3547313"},{"key":"e_1_3_2_1_15_1","first-page":"263","volume-title":"Proceedings of the ACM Symposium on Cloud Computing","author":"Ao Lixiang","year":"2018","unstructured":"Lixiang Ao, Liz Izhikevich, Geoffrey M Voelker, and George Porter. Sprocket: A serverless video processing framework. In Proceedings of the ACM Symposium on Cloud Computing, pages 263-274, 2018."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2019.00042"},{"issue":"2","key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","first-page":"503","DOI":"10.1145\/3296957.3173169","article-title":"Redesigning the gpu memory hierarchy to support multi-application concurrency","volume":"53","author":"Ausavarungnirun Rachata","year":"2018","unstructured":"Rachata Ausavarungnirun, Vance Miller, Joshua Landgraf, Saugata Ghose, Jayneel Gandhi, Adwait Jog, Christopher J Rossbach, and Onur Mutlu. Mask: Redesigning the gpu memory hierarchy to support multi-application concurrency. ACM SIGPLAN Notices, 53(2):503-518, 2018.","journal-title":"ACM SIGPLAN Notices"},{"key":"e_1_3_2_1_18_1","volume-title":"Harnessing the power of serverless runtimes for large-scale optimization. arXiv preprint arXiv:1901.03161","author":"Aytekin Arda","year":"2019","unstructured":"Arda Aytekin and Mikael Johansson. Harnessing the power of serverless runtimes for large-scale optimization. arXiv preprint arXiv:1901.03161, 2019."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2023.12.007"},{"key":"e_1_3_2_1_20_1","volume-title":"Workshop on Systems for ML and Open Source Software at NeurIPS","volume":"2018","author":"Carreira Joao","year":"2018","unstructured":"Joao Carreira, Pedro Fonseca, Alexey Tumanov, Andrew Zhang, and Randy Katz. A case for serverless machine learning. In Workshop on Systems for ML and Open Source Software at NeurIPS, volume 2018, 2018."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_22_1","volume-title":"Dirigent: Lightweight serverless orchestration. arXiv preprint arXiv:2404.16393","author":"Cvetkovi\u0107 Lazar","year":"2024","unstructured":"Lazar Cvetkovi\u0107, Fran\u00e7ois Costa, Mihajlo Djokic, Michal Friedman, and Ana Klimovic. Dirigent: Lightweight serverless orchestration. arXiv preprint arXiv:2404.16393, 2024."},{"key":"e_1_3_2_1_23_1","first-page":"28","volume-title":"Proceedings of the 4th Workshop on Resource Disaggregation and Serverless, WORDS '23","author":"Cvetkovi\u00e7 Lazar","year":"2023","unstructured":"Lazar Cvetkovi\u00e7, Rodrigo Fonseca, and Ana Klimovic. Understanding the neglected cost of serverless cluster management. In Proceedings of the 4th Workshop on Resource Disaggregation and Serverless, WORDS '23, page 22\u201328, New York, NY, USA, 2023. Association for Computing Machinery."},{"key":"e_1_3_2_1_24_1","volume-title":"September","year":"2025","unstructured":"DigitalOcean. 7 serverless gpu platforms for scalable inference workloads. https:\/\/www.digitalocean.com\/resources\/articles\/serverless-gpu-platforms, September 2025."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507732"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378512"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCS.2010.5547126"},{"key":"e_1_3_2_1_28_1","volume-title":"Cold start latency mitigation mechanisms in serverless computing: Taxonomy, review, and future directions. Journal of Systems Architecture, page 103115","author":"Ebrahimi Ana","year":"2024","unstructured":"Ana Ebrahimi, Mostafa Ghobaei-Arani, and Hadi Saboohi. Cold start latency mitigation mechanisms in serverless computing: Taxonomy, review, and future directions. Journal of Systems Architecture, page 103115, 2024."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","first-page":"739","DOI":"10.1109\/IPDPS53621.2022.00077","volume-title":"2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","author":"Fingler Henrique","year":"2022","unstructured":"Henrique Fingler, Zhiting Zhu, Esther Yoon, Zhipeng Jia, Emmett Witchel, and Christopher J Rossbach. Dgsf: Disaggregated gpus for serverless functions. In 2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pages 739-750. IEEE, 2022."},{"key":"e_1_3_2_1_30_1","first-page":"584","volume-title":"2022 SC22: International Conference for High Performance Computing, Networking, Storage and Analysis (SC)","author":"Fu Yuqi","year":"2022","unstructured":"Yuqi Fu, Li Liu, Haoliang Wang, Yue Cheng, and Songqing Chen. Sfs: Smart os scheduling for serverless functions. In 2022 SC22: International Conference for High Performance Computing, Networking, Storage and Analysis (SC), pages 584-599. IEEE Computer Society, 2022."},{"key":"e_1_3_2_1_31_1","first-page":"19","volume-title":"Proceedings of the 2024 USENIX Conference on Usenix Annual Technical Conference","author":"Fu Yuqi","year":"2024","unstructured":"Yuqi Fu, Ruizhe Shi, Haoliang Wang, Songqing Chen, and Yue Cheng. Alps: An adaptive learning, priority os scheduler for serverless function. In Proceedings of the 2024 USENIX Conference on Usenix Annual Technical Conference, pages 19-36, 2024."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing, HPDC '23. Association for Computing Machinery","author":"Fuerst Alexander","year":"2023","unstructured":"Alexander Fuerst, Abdul Rehman, and Prateek Sharma. Il\u00favatar: A fast control plane for serverless computing. In Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing, HPDC '23. Association for Computing Machinery, June 2023."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446757"},{"key":"e_1_3_2_1_34_1","volume-title":"Fuerst and Prateek Sharma. Locality-aware Load-Balancing For Serverless Clusters. In Proceedings of the 31st International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2022","author":"Alexander","year":"2022","unstructured":"Alexander Fuerst and Prateek Sharma. Locality-aware Load-Balancing For Serverless Clusters. In Proceedings of the 31st International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2022, New York, NY, USA, 2022. Association for Computing Machinery."},{"key":"e_1_3_2_1_35_1","volume-title":"Mlless: Achieving cost efficiency in serverless machine learning training. arXiv e-prints","author":"Sarroca Pablo Gimeno","year":"2022","unstructured":"Pablo Gimeno Sarroca and Marc S\u00e1nchez-Artigas. Mlless: Achieving cost efficiency in serverless machine learning training. arXiv e-prints, pages arXiv-2206, 2022."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/90.649569"},{"key":"e_1_3_2_1_37_1","volume-title":"Has-gpu: Efficient hybrid auto-scaling with fine-grained gpu allocation for slo-aware serverless inferences","author":"Gu Jianfeng","year":"2025","unstructured":"Jianfeng Gu, Puxuan Wang, Isaac Nunezand, Kai Huang, and Michael Gerndt. Has-gpu: Efficient hybrid auto-scaling with fine-grained gpu allocation for slo-aware serverless inferences, 2025."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605638"},{"key":"e_1_3_2_1_39_1","first-page":"443","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. Serving like clockwork: Performance predictability from the bottom up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 443-462, 2020."},{"key":"e_1_3_2_1_40_1","first-page":"539","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. Microsecond-scale preemption for concurrent inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 539-558, 2022."},{"key":"e_1_3_2_1_41_1","first-page":"301","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Hedayati Mohammad","year":"2019","unstructured":"Mohammad Hedayati, Kai Shen, Michael L Scott, and Mike Marty. fair queuing. In 2019 USENIX Annual Technical Conference (USENIX ATC 19), pages 301-314, 2019."},{"key":"e_1_3_2_1_42_1","volume-title":"Gpu virtualization and scheduling methods: A comprehensive survey. ACM Computing Surveys (CSUR), 50(3):1-37","author":"Hong Cheol-Ho","year":"2017","unstructured":"Cheol-Ho Hong, Ivor Spence, and Dimitrios S Nikolopoulos. Gpu virtualization and scheduling methods: A comprehensive survey. ACM Computing Surveys (CSUR), 50(3):1-37, 2017."},{"issue":"9","key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3721427","article-title":"A Survey","volume":"57","author":"Huang Jiahua","year":"2025","unstructured":"Jiahua Huang, Weiwei Lin, Wentai Wu, Yang Wang, Haocheng Zhong, Xinhua Wang, and Keqin Li. On Efficiency, Fairness and Security in AI Accelerator Resource Sharing: A Survey. ACM Computing Surveys, 57(9):1-35, September 2025.","journal-title":"ACM Computing Surveys"},{"key":"e_1_3_2_1_44_1","volume-title":"FluidFaaS: A Dynamic Pipelined Solution for Serverless Computing with Strong Isolation-based GPU Sharing","author":"Hui Xinning","year":"2025","unstructured":"Xinning Hui, Yuanchao Xu, and Xipeng Shen. FluidFaaS: A Dynamic Pipelined Solution for Serverless Computing with Strong Isolation-based GPU Sharing. 2025."},{"key":"e_1_3_2_1_45_1","volume-title":"Rapid rna sequencing data analysis using serverless computing. bioRxiv, page 576199","author":"Hung Ling-Hong","year":"2019","unstructured":"Ling-Hong Hung, Dimitar Kumanov, Xingzhi Niu, Wes Lloyd, and Ka Yee Yeung. Rapid rna sequencing data analysis using serverless computing. bioRxiv, page 576199, 2019."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","first-page":"117","DOI":"10.1145\/502034.502046","volume-title":"Proceedings of the eighteenth ACM symposium on Operating systems principles","author":"Iyer Sitaram","year":"2001","unstructured":"Sitaram Iyer and Peter Druschel. Anticipatory scheduling: A disk scheduling framework to overcome deceptive idleness in synchronous i\/o. In Proceedings of the eighteenth ACM symposium on Operating systems principles, pages 117-130, 2001."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304021"},{"key":"e_1_3_2_1_48_1","first-page":"1","volume-title":"SC24: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Jiang Yankai","year":"2024","unstructured":"Yankai Jiang, Rohan Basu Roy, Baolin Li, and Devesh Tiwari. Ecolife: Carbon-aware serverless function scheduling for sustainable computing. In SC24: International Conference for High Performance Computing, Networking, Storage and Analysis, pages 1-15. IEEE, 2024."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1145\/3368235.3368839","volume-title":"Proceedings of the 12th IEEE\/ACM International Conference on Utility and Cloud Computing Companion - UCC '19 Companion","author":"John Aji","year":"2019","unstructured":"Aji John, Kristiina Ausmees, Kathleen Muenzen, Catherine Kuhn, and Amanda Tan. SWEEP: Accelerating Scientific Research Through Scalable Serverless Workflows. In Proceedings of the 12th IEEE\/ACM International Conference on Utility and Cloud Computing Companion - UCC '19 Companion, pages 43-50, Auckland, New Zealand, 2019. ACM Press."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624783"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Justin San Juan and Bernard Wong. Reducing the Cost of GPU Cold Starts in Serverless Deep Learning Inference Serving. In 2023 IEEE International Conference on Pervasive Computing and Communications Workshops and other Affiliated Events (PerCom Workshops) pages 225-230 Atlanta GA USA March 2023. IEEE.","DOI":"10.1109\/PerComWorkshops56833.2023.10150381"},{"key":"e_1_3_2_1_52_1","volume-title":"Practical scheduling for real-world serverless computing. arXiv preprint arXiv:2111.07226","author":"Kaffes Kostis","year":"2021","unstructured":"Kostis Kaffes, Neeraja J Yadwadkar, and Christos Kozyrakis. Practical scheduling for real-world serverless computing. arXiv preprint arXiv:2111.07226, 2021."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563468"},{"key":"e_1_3_2_1_54_1","first-page":"533","volume-title":"Daeyoung Kim. GPU Enabled Serverless Computing Framework. In 2018 26th Euromicro International Conference on Parallel, Distributed and Network-based Processing (PDP)","author":"Kim Jaewook","year":"2018","unstructured":"Jaewook Kim, Tae Joon Jun, Daeyoun Kang, Dohyeun Kim, and Daeyoung Kim. GPU Enabled Serverless Computing Framework. In 2018 26th Euromicro International Conference on Parallel, Distributed and Network-based Processing (PDP), pages 533-540, Cambridge, March 2018. IEEE."},{"key":"e_1_3_2_1_55_1","volume-title":"Serverless computing provides on-demand high performance computing for biomedical research. arXiv preprint arXiv:1807.11659","author":"Kumanov Dimitar","year":"2018","unstructured":"Dimitar Kumanov, Ling-Hong Hung, Wes Lloyd, and Ka Yee Yeung. Serverless computing provides on-demand high performance computing for biomedical research. arXiv preprint arXiv:1807.11659, 2018."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563510"},{"key":"e_1_3_2_1_57_1","volume-title":"March","author":"Lin Ping-Min","year":"2019","unstructured":"Ping-Min Lin and Alex Glikson. Mitigating Cold Starts in Serverless Platforms: A Pool-Based Approach. arXiv:1903.12221 [cs], March 2019. arXiv: 1903.12221."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3487003"},{"key":"e_1_3_2_1_59_1","first-page":"311","volume-title":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Lv Cunchi","year":"2025","unstructured":"Cunchi Lv, Xiao Shi, Zhengyu Lei, Jinyue Huang, Wenting Tan, Xiaohui Zheng, and Xiaofang Zhao. Dilu: Enabling gpu resourcing-on-demand for serverless dl serving via introspective elasticity. In Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1, pages 311-325, 2025."},{"key":"e_1_3_2_1_60_1","first-page":"289","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. Themis: Fair and efficient cluster scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20), pages 289-304, 2020."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/UCC-Companion.2018.00054"},{"key":"e_1_3_2_1_62_1","first-page":"2403","article-title":"Optimal Resource Efficiency with Fairness","author":"Mo Zizhao","year":"2024","unstructured":"Zizhao Mo, Huanle Xu, and Wing Cheong Lau. Optimal Resource Efficiency with Fairness in Heterogeneous GPU Clusters, March 2024. arXiv:2403.18545 [cs].","journal-title":"Heterogeneous GPU Clusters"},{"key":"e_1_3_2_1_63_1","first-page":"6","volume-title":"Naren Nayak. Agile Cold Starts for Scalable Serverless. USENIX Workshop on Hot Topics in Cloud Computing (HotCloud)","author":"Mohan Anup","year":"2019","unstructured":"Anup Mohan, Harshad Sane, Kshitij Doshi, Saikrishna Edupuganti, Vadim Sukhomlinov, and Naren Nayak. Agile Cold Starts for Scalable Serverless. USENIX Workshop on Hot Topics in Cloud Computing (HotCloud), page 6, 2019."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2020.01.004"},{"key":"e_1_3_2_1_65_1","first-page":"595","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Ng Kelvin KW","year":"2023","unstructured":"Kelvin KW Ng, Henri Maxime Demoulin, and Vincent Liu. Paella: Low-latency model serving with software-defined gpu scheduling. In Proceedings of the 29th Symposium on Operating Systems Principles, pages 595-610, 2023."},{"key":"e_1_3_2_1_66_1","volume-title":"https:\/\/docs.nvidia.com\/deploy\/mps\/index.html","author":"Nvidia","year":"2023","unstructured":"Nvidia. Nvidia MPS. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html, 2023."},{"key":"e_1_3_2_1_67_1","volume-title":"NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html","year":"2023","unstructured":"Nvidia. NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html, 2023."},{"key":"e_1_3_2_1_68_1","volume-title":"Kernel-as-a-service: A serverless interface to gpus. arXiv preprint arXiv:2212.08146","author":"Pemberton Nathan","year":"2022","unstructured":"Nathan Pemberton, Anton Zabreyko, Zhoujie Ding, Randy Katz, and Joseph Gonzalez. Kernel-as-a-service: A serverless interface to gpus. arXiv preprint arXiv:2212.08146, 2022."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"crossref","first-page":"109","DOI":"10.1145\/2287076.2287091","volume-title":"Proceedings of the 21st international symposium on High-Performance Parallel and Distributed Computing","author":"Phull Rajat","year":"2012","unstructured":"Rajat Phull, Cheng-Hong Li, Kunal Rao, Hari Cadambi, and Srimat Chakradhar. Interference-driven resource management for gpu-based heterogeneous clusters. In Proceedings of the 21st international symposium on High-Performance Parallel and Distributed Computing, pages 109-120, 2012."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524053.3542751"},{"key":"e_1_3_2_1_71_1","first-page":"1","volume-title":"Proceedings of the ACM Symposium on Cloud Computing","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Mark Zhao, Neeraja J Yadwadkar, and Christos Kozyrakis. Llama: A heterogeneous & serverless framework for auto-tuning video analytics pipelines. In Proceedings of the ACM Symposium on Cloud Computing, pages 1-17, 2021."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507750"},{"key":"e_1_3_2_1_73_1","first-page":"112","volume-title":"2024 IEEE 24th International Symposium on Cluster, Cloud and Internet Computing (CCGrid)","author":"Samanta Amit","year":"2024","unstructured":"Amit Samanta and Ryan Stutsman. Fair, efficient multi-resource scheduling for stateless serverless functions with anubis. In 2024 IEEE 24th International Symposium on Cluster, Cloud and Internet Computing (CCGrid), page 106\u2013112, May 2024."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","first-page":"17","DOI":"10.1145\/3452413.3464785","volume-title":"Proceedings of the 1st Workshop on High Performance Serverless Computing","author":"Satzke Klaus","year":"2020","unstructured":"Klaus Satzke, Istemi Ekin Akkus, Ruichuan Chen, Ivica Rimac, Manuel Stein, Andre Beck, Paarijaat Aditya, Manohar Vanga, and Volker Hilt. Efficient gpu sharing for serverless workflows. In Proceedings of the 1st Workshop on High Performance Serverless Computing, pages 17-24, 2020."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3406011"},{"key":"e_1_3_2_1_76_1","first-page":"205","volume-title":"Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC 20)","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, Inigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC 20), pages 205-218, 2020."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421287"},{"key":"e_1_3_2_1_78_1","first-page":"965","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sheng Ying","year":"2024","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Banghua Zhu, Zhuohan Li, Danyang Zhuo, Joseph E Gonzalez, and Ion Stoica. Fairness in serving large language models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 965-988, 2024."},{"key":"e_1_3_2_1_79_1","first-page":"154","volume-title":"FaaSter","author":"Spillner Josef","year":"2018","unstructured":"Josef Spillner, Cristian Mateos, and David A. Monge. FaaSter, Better, Cheaper: The Prospect of Serverless Scientific Computing and HPC. In Esteban Mocskos and Sergio Nesmachnow, editors, High Performance Computing, volume 796, pages 154-168. Springer International Publishing, Cham, 2018."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"crossref","first-page":"1075","DOI":"10.1145\/3627703.3629578","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems","author":"Strati Foteini","year":"2024","unstructured":"Foteini Strati, Xianzhe Ma, and Ana Klimovic. Orion: Interference-aware, fine-grained gpu sharing for ml applications. In Proceedings of the Nineteenth European Conference on Computer Systems, pages 1075-1092, 2024."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"crossref","first-page":"55","DOI":"10.1145\/3143361.3143368","volume-title":"Proceedings of the 13th International Conference on emerging Networking EXperiments and Technologies","author":"Sundarrajan Aditya","year":"2017","unstructured":"Aditya Sundarrajan, Mingdong Feng, Mangesh Kasbekar, and Ramesh K Sitaraman. Footprint descriptors: Theory and practice of cache provisioning in a global cdn. In Proceedings of the 13th International Conference on emerging Networking EXperiments and Technologies, pages 55-67, 2017."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACSOS49614.2020.00020"},{"key":"e_1_3_2_1_83_1","first-page":"109","volume-title":"2014 USENIX Annual Technical Conference (USENIX ATC 14)","author":"Suzuki Yusuke","year":"2014","unstructured":"Yusuke Suzuki, Shinpei Kato, Hiroshi Yamada, and Kenji Kono. : Why not virtualizing at the hypervisor? In 2014 USENIX Annual Technical Conference (USENIX ATC 14), pages 109-120, 2014."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605181.3626191"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446714"},{"issue":"5","key":"e_1_3_2_1_86_1","doi-asserted-by":"crossref","first-page":"3917","DOI":"10.1109\/JIOT.2022.3165127","article-title":"Mitigating cold start problem in serverless computing: A reinforcement learning approach","volume":"10","author":"Vahidinia Parichehr","year":"2022","unstructured":"Parichehr Vahidinia, Bahar Farahani, and Fereidoon Shams Aliee. Mitigating cold start problem in serverless computing: A reinforcement learning approach. IEEE Internet of Things Journal, 10(5):3917-3927, 2022.","journal-title":"IEEE Internet of Things Journal"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"crossref","first-page":"47","DOI":"10.1109\/eScience.2015.15","volume-title":"2015 IEEE 11th International Conference on e-Science","author":"Varghese Blesson","year":"2015","unstructured":"Blesson Varghese, Javier Prades, Carlos Reano, and Federico Silla. Acceleration-as-a-service: Exploiting virtualised gpus for a financial application. In 2015 IEEE 11th International Conference on e-Science, pages 47-56. IEEE, 2015."},{"key":"e_1_3_2_1_88_1","first-page":"945","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. in the wild: Workload analysis and scheduling in heterogeneous clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22), pages 945-960, 2022."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2018.8622362"},{"key":"e_1_3_2_1_90_1","first-page":"69","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Zili Zhang, Zhihao Bai, Xuanzhe Liu, and Xin Jin. Transparent GPU sharing in container clouds for deep learning workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23), pages 69-85, 2023."},{"key":"e_1_3_2_1_91_1","first-page":"59","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Wu Hao","year":"2024","unstructured":"Hao Wu, Yue Yu, Junxiao Deng, Shadi Ibrahim, Song Wu, Hao Fan, Ziyue Cheng, and Hai Jin. : A lightweight for serverless inference workflow. In 2024 USENIX Annual Technical Conference (USENIX ATC 24), pages 59-73, 2024."},{"key":"e_1_3_2_1_92_1","first-page":"595","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, et al. Gandiva: Introspective cluster scheduling for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pages 595-610, 2018."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3054656"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5160873"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.1145\/3638757"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3369583.3392679"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1145\/3317550.3321423"},{"key":"e_1_3_2_1_99_1","volume-title":"Slo-aware, gpu-efficient serverless inference via model swapping","author":"Yu Minchen","year":"2024","unstructured":"Minchen Yu, Ao Wang, Dong Chen, Haoxuan Yu, Xiaonan Luo, Zhuohao Li, Wei Wang, Ruichuan Chen, Dapeng Nie, and Haoran Yang. Faaswap: Slo-aware, gpu-efficient serverless inference via model swapping, 2024."},{"key":"e_1_3_2_1_100_1","first-page":"597","volume-title":"Proceedings of the 2025 USENIX Annual Technical Conference, USENIX ATC 2025","author":"Yu Minchen","year":"2025","unstructured":"Minchen Yu, Ao Wang, Dong Chen, Haoxuan Yu, Xiaonan Luo, Zhuohao Li, Wei Wang, Ruichuan Chen, Dapeng Nie, Haoran Yang, and Yu Ding. Torpor: Gpu-enabled serverless computing for low-latency, resource-efficient inference. In Deniz Altinb\u00fcken and Ryan Stutsman, editors, Proceedings of the 2025 USENIX Annual Technical Conference, USENIX ATC 2025, Boston, MA, USA, July 7-9, 2025, pages 597-612. USENIX Association, 2025."},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1145\/3304112.3325608"},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696070"},{"key":"e_1_3_2_1_103_1","volume-title":"Towards fast setup and high throughput of gpu serverless computing. arXiv preprint arXiv:2404.14691","author":"Zhao Han","year":"2024","unstructured":"Han Zhao, Weihao Cui, Quan Chen, Shulai Zhang, Zijun Li, Jingwen Leng, Chao Li, Deze Zeng, and Minyi Guo. Towards fast setup and high throughput of gpu serverless computing. arXiv preprint arXiv:2404.14691, 2024."},{"key":"e_1_3_2_1_104_1","first-page":"184","volume-title":"Proceedings of the 25th International Middleware Conference","author":"Zhao Yuxuan","year":"2024","unstructured":"Yuxuan Zhao, Weikang Weng, Rob van Nieuwpoort, and Alexandru Uta. In serverless, os scheduler choice costs money: A hybrid scheduling approach for cheaper faas. In Proceedings of the 25th International Middleware Conference, page 172\u2013184, Hong Kong Hong Kong, December 2024. ACM."},{"key":"e_1_3_2_1_105_1","first-page":"439 2159","volume-title":"2024 IEEE 17th International Conference on Cloud Computing (CLOUD)","author":"Zhu Jiafan","year":"2024","unstructured":"Jiafan Zhu, Xiao Zhang, Konstantinos Menychtas, Zhijing Gene Qin, Steven Hand, Dragos Sbirlea, and Yuang Liu. GASS: GPU Automated Sharing at Scale. In 2024 IEEE 17th International Conference on Cloud Computing (CLOUD), pages 439-445, July 2024. ISSN: 2159-6190."},{"key":"e_1_3_2_1_106_1","first-page":"132 2643","volume-title":"2020 IEEE 32nd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)","author":"Zuk Pawel","year":"2020","unstructured":"Pawel Zuk and Krzysztof Rzadca. Scheduling Methods to Reduce Response Latency of Function as a Service. In 2020 IEEE 32nd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD), pages 132-140, September 2020. ISSN: 2643-3001."},{"key":"e_1_3_2_1_107_1","first-page":"172 2168","volume-title":"2022 IEEE International Conference on Cluster Computing (CLUSTER)","author":"Zuk Pawe\u0142","year":"2022","unstructured":"Pawe\u0142 Zuk, Bart\u0142omiej Przybylski, and Krzysztof Rzadca. Call Scheduling to Reduce Response Time of a FaaS System. In 2022 IEEE International Conference on Cluster Computing (CLUSTER), pages 172-182, September 2022. ISSN: 2168-9253."}],"event":{"name":"ICPE '26: 17th ACM\/SPEC International Conference on Performance Engineering","location":"Florence Italy","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SPEC"]},"container-title":["Proceedings of the 17th ACM\/SPEC International Conference on Performance Engineering"],"original-title":[],"deposited":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T12:30:16Z","timestamp":1776947416000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3777884.3797000"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,3]]},"references-count":107,"alternative-id":["10.1145\/3777884.3797000","10.1145\/3777884"],"URL":"https:\/\/doi.org\/10.1145\/3777884.3797000","relation":{},"subject":[],"published":{"date-parts":[[2026,5,3]]},"assertion":[{"value":"2026-05-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}