{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T22:28:38Z","timestamp":1782944918697,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T00:00:00Z","timestamp":1726444800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2237295"],"award-info":[{"award-number":["2237295"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,9,16]]},"DOI":"10.1145\/3688351.3689156","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T06:19:50Z","timestamp":1726467590000},"page":"68-82","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Optimizing GPU Sharing for Container-Based DNN Serving with Multi-Instance GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4944-4176","authenticated-orcid":false,"given":"Xinpeng","family":"Wei","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7388-7341","authenticated-orcid":false,"given":"Zhichao","family":"Li","sequence":"additional","affiliation":[{"name":"ByteDance Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1420-5125","authenticated-orcid":false,"given":"Cheng","family":"Tan","sequence":"additional","affiliation":[{"name":"Northeastern University"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,9,16]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"NVIDIA Triton. https:\/\/developer.nvidia.com\/nvidia-triton-inference-server."},{"key":"e_1_3_2_1_2_1","unstructured":"TorchServe. https:\/\/pytorch.org\/serve\/."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/www.xilinx.com\/support\/documentation\/sw_manuals\/xilinx2018_1\/ug909-vivado-partial-reconfiguration.pdf","author":"Suite User Guide Partial Vivado Design","year":"2018","unstructured":"Vivado Design Suite User Guide Partial Reconfiguration. https:\/\/www.xilinx.com\/support\/documentation\/sw_manuals\/xilinx2018_1\/ug909-vivado-partial-reconfiguration.pdf, 2018."},{"key":"e_1_3_2_1_4_1","volume-title":"https:\/\/en.wikipedia.org\/wiki\/Cutting_stock_problem","author":"Cutting","year":"2021","unstructured":"Cutting stock problem. https:\/\/en.wikipedia.org\/wiki\/Cutting_stock_problem, 2021."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/docs.nvidia.com\/datacenter\/tesla\/pdf\/fabric-manager-user-guide.pdf","author":"Switch Systems Fabric Manager","year":"2021","unstructured":"Fabric Manager for NVIDIA NVSwitch Systems. https:\/\/docs.nvidia.com\/datacenter\/tesla\/pdf\/fabric-manager-user-guide.pdf, 2021."},{"key":"e_1_3_2_1_6_1","volume-title":"https:\/\/kubernetes.io\/docs\/concepts\/architecture\/controller\/","author":"Controllers Kubernetes","year":"2021","unstructured":"Kubernetes Controllers. https:\/\/kubernetes.io\/docs\/concepts\/architecture\/controller\/, 2021."},{"key":"e_1_3_2_1_7_1","volume-title":"https:\/\/developer.nvidia.com\/blog\/minimizing-dl-inference-latency-with-mig\/","author":"Learning Inference Minimizing Deep","year":"2021","unstructured":"Minimizing Deep Learning Inference Latency with NVIDIA Multi-Instance GPU. https:\/\/developer.nvidia.com\/blog\/minimizing-dl-inference-latency-with-mig\/, 2021."},{"key":"e_1_3_2_1_8_1","volume-title":"https:\/\/docs.nvidia.com\/datacenter\/tesla\/pdf\/NVIDIA_MIG_User_Guide.pdf","author":"User Guide NVIDIA","year":"2021","unstructured":"NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/pdf\/NVIDIA_MIG_User_Guide.pdf, 2021."},{"key":"e_1_3_2_1_9_1","volume-title":"https:\/\/aws.amazon.com\/machine-learning\/containers\/","author":"Deep Learning Containers AWS","year":"2023","unstructured":"AWS Deep Learning Containers. https:\/\/aws.amazon.com\/machine-learning\/containers\/, 2023."},{"key":"e_1_3_2_1_10_1","volume-title":"https:\/\/github.com\/nebuly-ai\/nos","author":"Nebuly Operating","year":"2023","unstructured":"Nebuly Operating System (nos). https:\/\/github.com\/nebuly-ai\/nos, 2023."},{"key":"e_1_3_2_1_11_1","volume-title":"https:\/\/docs.nvidia.com\/deploy\/mps\/index.html","author":"Multi-Process Service NVIDIA","year":"2023","unstructured":"NVIDIA Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html, 2023."},{"key":"e_1_3_2_1_12_1","volume-title":"https:\/\/cloud.google.com\/vertex-ai\/docs\/general\/deep-learning","author":"Images Use Deep","year":"2023","unstructured":"Use Deep Learning VM Images and Deep Learning Containers with Vertex AI. https:\/\/cloud.google.com\/vertex-ai\/docs\/general\/deep-learning, 2023."},{"key":"e_1_3_2_1_13_1","volume-title":"https:\/\/github.com\/kubernetes\/autoscaler","author":"Autoscaler Kubernetes","year":"2024","unstructured":"Kubernetes Autoscaler. https:\/\/github.com\/kubernetes\/autoscaler, 2024."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2015.2401597"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.procir.2015.06.015"},{"key":"e_1_3_2_1_16_1","volume-title":"Proc. OSDI","author":"Bai Z.","year":"2020","unstructured":"Z. Bai, Z. Zhang, Y. Zhu, and X. Jin. Pipeswitch: Fast pipelined context switching for deep learning applications. In Proc. OSDI, 2020."},{"key":"e_1_3_2_1_17_1","volume-title":"Accelerating deep learning inference via learned caches. arXiv preprint arXiv:2101.07344","author":"Balasubramanian A.","year":"2021","unstructured":"A. Balasubramanian, A. Kumar, Y. Liu, H. Cao, S. Venkataraman, and A. Akella. Accelerating deep learning inference via learned caches. arXiv preprint arXiv:2101.07344, 2021."},{"key":"e_1_3_2_1_18_1","volume-title":"Scheduling splittable jobs on configurable machines. arXiv preprint arXiv:2312.05416","author":"Casey M.","year":"2023","unstructured":"M. Casey, R. Rajaraman, D. Stalfa, and C. Tan. Scheduling splittable jobs on configurable machines. arXiv preprint arXiv:2312.05416, 2023."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_2_1_20_1","first-page":"199","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi S.","year":"2022","unstructured":"S. Choi, S. Lee, Y. Kim, J. Park, Y. Kwon, and J. Huh. Serving heterogeneous machine learning models on {Multi-GPU } servers with {Spatio-Temporal} sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22), pages 199--216, 2022."},{"key":"e_1_3_2_1_21_1","volume-title":"Proc. NSDI","author":"Crankshaw D.","year":"2017","unstructured":"D. Crankshaw, X. Wang, G. Zhou, M. J. Franklin, J. E. Gonzalez, and I. Stoica. Clipper: A low-latency online prediction serving system. In Proc. NSDI, 2017."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"e_1_3_2_1_23_1","first-page":"281","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Fried J.","year":"2020","unstructured":"J. Fried, Z. Ruan, A. Ousterhout, and A. Belay. Caladan: Mitigating interference at microsecond timescales. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 281--297, 2020."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190541"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.3182\/20090819-3-PL-3002.00043"},{"key":"e_1_3_2_1_26_1","volume-title":"Proc. NSDI","author":"Gu J.","year":"2019","unstructured":"J. Gu, M. Chowdhury, K. G. Shin, Y. Zhu, M. Jeon, J. Qian, H. Liu, and C. Guo. Tiresias: A {GPU} cluster manager for distributed deep learning. In Proc. NSDI, 2019."},{"key":"e_1_3_2_1_27_1","volume-title":"Proc. OSDI","author":"Gujarati A.","year":"2020","unstructured":"A. Gujarati, R. Karimi, S. Alzayat, W. Hao, A. Kaufmann, Y. Vigfusson, and J. Mace. Serving dnns like clockwork: Performance predictability from the bottom up. In Proc. OSDI, 2020."},{"key":"e_1_3_2_1_28_1","first-page":"1041","volume-title":"USENIX NSDI","author":"Gunasekaran J. R.","year":"2022","unstructured":"J. R. Gunasekaran, C. S. Mishra, P. Thinakaran, B. Sharma, M. T. Kandemir, and C. R. Das. Cocktail: A multidimensional optimization for model serving in cloud. In USENIX NSDI, pages 1041--1057, 2022."},{"key":"e_1_3_2_1_29_1","first-page":"539","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han M.","year":"2022","unstructured":"M. Han, H. Zhang, R. Chen, and H. Chen. Microsecond-scale preemption for concurrent {GPU-accelerated}{DNN} inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 539--558, 2022."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2024.3430063"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNSM.2016.2598420"},{"key":"e_1_3_2_1_32_1","volume-title":"Dynamic space-time scheduling for gpu inference. arXiv preprint arXiv:1901.00041","author":"Jain P.","year":"2018","unstructured":"P. Jain, X. Mo, A. Jain, H. Subbaraj, R. S. Durrani, A. Tumanov, J. Gonzalez, and I. Stoica. Dynamic space-time scheduling for gpu inference. arXiv preprint arXiv:1901.00041, 2018."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3350755.3400247"},{"key":"e_1_3_2_1_34_1","first-page":"16","volume-title":"4th Metaheuristics International Conference MIC","author":"J\u00f3zefowska J.","year":"2001","unstructured":"J. J\u00f3zefowska, M. Mika, R. R\u00f3\u017cycki, G. Walig\u00f3ra, and J. W\u0119glarz. Solving discrete-continuous scheduling problems by tabu search. In 4th Metaheuristics International Conference MIC, pages 16--20, 2001."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0007-8506(07)62120-9"},{"key":"e_1_3_2_1_36_1","volume-title":"Proc. NSDI","author":"Mahajan K.","year":"2020","unstructured":"K. Mahajan, A. Balasubramanian, A. Singhvi, S. Venkataraman, A. Akella, A. Phanishayee, and S. Chawla. Themis: Fair and efficient {GPU} cluster scheduling. In Proc. NSDI, 2020."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2020.106416"},{"key":"e_1_3_2_1_38_1","first-page":"1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"McClure S.","year":"2022","unstructured":"S. McClure, A. Ousterhout, S. Shenker, and S. Ratnasamy. Efficient scheduling policies for {Microsecond-Scale} tasks. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22), pages 1--18, 2022."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483588"},{"key":"e_1_3_2_1_40_1","volume-title":"Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139","author":"Olston C.","year":"2017","unstructured":"C. Olston, N. Fiedel, K. Gorovoy, J. Harmsen, L. Lao, F. Li, V. Rajashekhar, S. Ramesh, and J. Soyke. Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139, 2017."},{"key":"e_1_3_2_1_41_1","first-page":"361","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Ousterhout A.","year":"2019","unstructured":"A. Ousterhout, J. Fried, J. Behrens, A. Belay, and H. Balakrishnan. Shenango: Achieving high {CPU } efficiency for latency-sensitive datacenter workloads. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19), pages 361--378, 2019."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ejor.2006.12.006"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"M. Pinedo. Scheduling volume 29. Springer 2012.","DOI":"10.1007\/978-1-4614-2361-4"},{"key":"e_1_3_2_1_46_1","volume-title":"Infaas: A model-less inference serving system. arXiv preprint arXiv:1905.13348","author":"Romero F.","year":"2019","unstructured":"F. Romero, Q. Li, N. J. Yadwadkar, and C. Kozyrakis. Infaas: A model-less inference serving system. arXiv preprint arXiv:1905.13348, 2019."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2004.99"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486987"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.3390\/electronics9091461"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411029.3411035"},{"key":"e_1_3_2_1_52_1","volume-title":"Proc. OSDI","author":"Xiao W.","year":"2018","unstructured":"W. Xiao, R. Bhardwaj, R. Ramjee, M. Sivathanu, N. Kwatra, Z. Han, P. Patel, X. Peng, H. Zhao, Q. Zhang, et al. Gandiva: Introspective cluster scheduling for deep learning. In Proc. OSDI, 2018."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0166-218X(00)00176-1"},{"key":"e_1_3_2_1_54_1","first-page":"787","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang H.","year":"2023","unstructured":"H. Zhang, Y. Tang, A. Khandelwal, and I. Stoica. {SHEPHERD}: Serving {DNNs} in the wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23), pages 787--808, 2023."},{"key":"e_1_3_2_1_55_1","volume-title":"Proc. OSDI","author":"Zhao H.","year":"2020","unstructured":"H. Zhao, Z. Han, Z. Yang, Q. Zhang, F. Yang, L. Zhou, M. Yang, F. C. Lau, Y. Wang, Y. Xiong, et al. Hived: Sharing a {GPU } cluster for deep learning with guarantees. In Proc. OSDI, 2020."}],"event":{"name":"SYSTOR '24: The 17th ACM International Systems and Storage Conference","location":"Virtual Israel","acronym":"SYSTOR '24","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","Technion Israel Institute of Technology"]},"container-title":["Proceedings of the 17th ACM International Systems and Storage Conference on ZZZ"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3688351.3689156","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3688351.3689156","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T19:53:11Z","timestamp":1756237991000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3688351.3689156"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,16]]},"references-count":55,"alternative-id":["10.1145\/3688351.3689156","10.1145\/3688351"],"URL":"https:\/\/doi.org\/10.1145\/3688351.3689156","relation":{},"subject":[],"published":{"date-parts":[[2024,9,16]]},"assertion":[{"value":"2024-09-16","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}