{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T10:03:46Z","timestamp":1767261826445,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T00:00:00Z","timestamp":1732060800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSF","award":["CCF-2324859, CNS-2214980, CNS-2106434, CNS-1750109, CNS-2106027, CNS-2146909, CCF-2046444"],"award-info":[{"award-number":["CCF-2324859, CNS-2214980, CNS-2106434, CNS-1750109, CNS-2106027, CNS-2146909, CCF-2046444"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,20]]},"DOI":"10.1145\/3698038.3698555","type":"proceedings-article","created":{"date-parts":[[2024,11,14]],"date-time":"2024-11-14T06:32:43Z","timestamp":1731565963000},"page":"460-469","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["KACE: Kernel-Aware Colocation for Efficient GPU Spatial Sharing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5449-0148","authenticated-orcid":false,"given":"Bing-Shiun","family":"Han","sequence":"first","affiliation":[{"name":"Stony Brook University Stony, Brook, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4526-2230","authenticated-orcid":false,"given":"Tathagata","family":"Paul","sequence":"additional","affiliation":[{"name":"Stony Brook University Stony, Brook, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8026-4502","authenticated-orcid":false,"given":"Zhenhua","family":"Liu","sequence":"additional","affiliation":[{"name":"Stony Brook University Stony, Brook, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2351-6523","authenticated-orcid":false,"given":"Anshul","family":"Gandhi","sequence":"additional","affiliation":[{"name":"Stony Brook University Stony, Brook, New York, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,11,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. KACE Artifact. https:\/\/github.com\/nba556677go\/KACE-artifact GitHub repository."},{"key":"e_1_3_2_1_2_1","unstructured":"Alexei Baevski Henry Zhou Abdelrahman Mohamed and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. arXiv:2006.11477 [cs.CL] https:\/\/arxiv.org\/abs\/2006.11477"},{"key":"e_1_3_2_1_3_1","unstructured":"Lasai Barre\u00f1ada Paula Dhiman Dirk Timmerman Anne-Laure Boulesteix and Ben Van Calster. 2024. Understanding random forests and overfitting: a visualization and simulation study. arXiv:2402.18612 [stat.ME] https:\/\/arxiv.org\/abs\/2402.18612"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607060"},{"key":"e_1_3_2_1_5_1","volume-title":"Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. 2022. Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA, 199--216. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/choi-seungbeom"},{"key":"e_1_3_2_1_6_1","unstructured":"NVIDIA Corporation. 2021. CUDA Multi-Process Service Overview. https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451125"},{"key":"e_1_3_2_1_8_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"e_1_3_2_1_10_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv:2010.11929 [cs.CV] https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"e_1_3_2_1_11_1","unstructured":"Hugging Face. 2023. The Llama 3 Models Were Trained on 15 Trillion Tokens with 24 000 GPUs. https:\/\/huggingface.co\/blog\/llama3#:~:text=The%20Llama%203%20models%20were two%20clusters%20with%2024%2C000%20GPUs. Accessed: 2024-07-03."},{"key":"e_1_3_2_1_12_1","unstructured":"GitHub. [n.d.]. GitHub Copilot. https:\/\/copilot.github.com\/."},{"key":"e_1_3_2_1_13_1","unstructured":"H2O.ai. 2024. H2O.ai AutoML Documentation. https:\/\/docs.h2o.ai\/h2o\/latest-stable\/h2o-docs\/index.html."},{"key":"e_1_3_2_1_14_1","volume-title":"Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. 2022. Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 539--558. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/han"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357223.3362734"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 2020 USENIX Annual Technical Conference (USENIX ATC '20). USENIX Association.","author":"Keahey Kate","year":"2020","unstructured":"Kate Keahey, Jason Anderson, Zhuo Zhen, Pierre Riteau, Paul Ruth, Dan Stanzione, Mert Cevik, Jacob Colleran, Haryadi S. Gunawi, Cody Hammock, Joe Mambretti, Alexander Barnes, Fran\u00e7ois Halbach, Alex Rocha, and Joe Stubbs. 2020. Lessons Learned from the Chameleon Testbed. In Proceedings of the 2020 USENIX Annual Technical Conference (USENIX ATC '20). USENIX Association."},{"key":"e_1_3_2_1_17_1","volume-title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arXiv:1909.11942 [cs.CL] https:\/\/arxiv.org\/abs\/1909.11942","author":"Lan Zhenzhong","year":"2020","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2020. ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. arXiv:1909.11942 [cs.CL] https:\/\/arxiv.org\/abs\/1909.11942"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563510"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patter.2021.100218"},{"key":"e_1_3_2_1_20_1","volume-title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 663--679. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/li-zhouhan"},{"key":"e_1_3_2_1_21_1","volume-title":"Zico: Efficient GPU Memory Sharing for Concurrent DNN Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Lim Gangmuk","year":"2021","unstructured":"Gangmuk Lim, Jeongseob Ahn, Wencong Xiao, Youngjin Kwon, and Myeongjae Jeon. 2021. Zico: Efficient GPU Memory Sharing for Concurrent DNN Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 161--175. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/lim"},{"key":"e_1_3_2_1_22_1","unstructured":"LMSYS Team. 2023. LMSYS-CHAT-1M: A Dataset for Large-Scale Multimodal System Chat. Online. https:\/\/arxiv.org\/abs\/2309.11998 Accessed: YYYY-MM-DD."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155650"},{"key":"e_1_3_2_1_24_1","unstructured":"NVIDIA. 2021. NVIDIA AI Enterprise: Deployment Guide on VMware. https:\/\/docs.nvidia.com\/ai-enterprise\/deployment-guide-vmware\/0.1.0\/advance-gpu.html Accessed: 2024-07-03."},{"key":"e_1_3_2_1_25_1","unstructured":"NVIDIA Corporation. 2016. NVIDIA System Management Interface. https:\/\/developer.download.nvidia.com\/compute\/DCGM\/docs\/nvidia-smi-367.38.pdf Version 367.38."},{"key":"e_1_3_2_1_26_1","unstructured":"NVIDIA Corporation. 2021. NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html Accessed: 2024-07-03."},{"key":"e_1_3_2_1_27_1","unstructured":"NVIDIA Corporation. 2023. NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-compute."},{"key":"e_1_3_2_1_28_1","unstructured":"OpenAI. [n.d.]. ChatGPT. https:\/\/openai.com\/chatgpt\/."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453417.3453432"},{"key":"e_1_3_2_1_30_1","volume-title":"Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever.","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. arXiv:2212.04356 [eess.AS] https:\/\/arxiv.org\/abs\/2212.04356"},{"key":"e_1_3_2_1_31_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. https:\/\/api.semanticscholar.org\/CorpusID:160025533"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Ties Robroek Ehsan Yousefzadeh-Asl-Miandoab and Pinar T\u00f6z\u00fcn. 2023. An Analysis of Collocation on GPUs for Deep Learning Training. arXiv:2209.06018 [cs.LG] https:\/\/arxiv.org\/abs\/2209.06018","DOI":"10.1145\/3642970.3655827"},{"key":"e_1_3_2_1_33_1","volume-title":"INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J. Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 397--411. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/romero"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629578"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486979"},{"key":"e_1_3_2_1_37_1","volume-title":"Garnett (Eds.)","volume":"31","author":"Tang Cheng","year":"2018","unstructured":"Cheng Tang, Damien Garreau, and Ulrike von Luxburg. 2018. When do random forests fail?. In Advances in Neural Information Processing Systems, S. Bengio, H. Wallach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett (Eds.), Vol. 31. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2018\/file\/204da255aea2cd4a75ace6018fad6b4d-Paper.pdf"},{"key":"e_1_3_2_1_38_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2023. Attention Is All You Need. arXiv:1706.03762 [cs.CL] https:\/\/arxiv.org\/abs\/1706.03762"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.)","volume":"3","author":"Wang Guanhua","year":"2021","unstructured":"Guanhua Wang, Kehan Wang, Kenan Jiang, XIANGJUN LI, and Ion Stoica. 2021. Wavelet: Efficient DNN Training with Tick-Tock Scheduling. In Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.), Vol. 3. 696--710. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2021\/file\/099268c3121d49937a67a052c51f865d-Paper.pdf"},{"key":"e_1_3_2_1_40_1","volume-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 595--610. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/xiao"},{"key":"e_1_3_2_1_41_1","volume-title":"AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 533--548. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/xiao"},{"key":"e_1_3_2_1_42_1","volume-title":"Characterization and Prediction of Performance Interference on Mediated Passthrough GPUs for Interference-aware Scheduler. In 11th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 19)","author":"Xu Xin","year":"2019","unstructured":"Xin Xu, Na Zhang, Michael Cui, Michael He, and Ridhi Surana. 2019. Characterization and Prediction of Performance Interference on Mediated Passthrough GPUs for Interference-aware Scheduler. In 11th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 19). Renton, WA, USA."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3079202"},{"key":"e_1_3_2_1_44_1","volume-title":"Salus: Fine-Grained GPU Sharing Primitives for Deep Learning Applications. CoRR abs\/1902.04610","author":"Yu Peifeng","year":"2019","unstructured":"Peifeng Yu and Mosharaf Chowdhury. 2019. Salus: Fine-Grained GPU Sharing Primitives for Deep Learning Applications. CoRR abs\/1902.04610 (2019). arXiv:1902.04610 http:\/\/arxiv.org\/abs\/1902.04610"},{"key":"e_1_3_2_1_45_1","volume-title":"SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 787--808. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/zhang-hong"}],"event":{"name":"SoCC '24: ACM Symposium on Cloud Computing","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGOPS ACM Special Interest Group on Operating Systems"],"location":"Redmond WA USA","acronym":"SoCC '24"},"container-title":["Proceedings of the ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3698038.3698555","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3698038.3698555","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:01:09Z","timestamp":1755889269000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3698038.3698555"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,20]]},"references-count":45,"alternative-id":["10.1145\/3698038.3698555","10.1145\/3698038"],"URL":"https:\/\/doi.org\/10.1145\/3698038.3698555","relation":{},"subject":[],"published":{"date-parts":[[2024,11,20]]},"assertion":[{"value":"2024-11-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}