{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T15:37:46Z","timestamp":1768405066425,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"NKRDP","award":["2021YFB0300800"],"award-info":[{"award-number":["2021YFB0300800"]}]},{"name":"NSFC","award":["62102396"],"award-info":[{"award-number":["62102396"]}]},{"DOI":"10.13039\/501100005090","name":"Beijing Nova Program","doi-asserted-by":"publisher","award":["Z211100002121143"],"award-info":[{"award-number":["Z211100002121143"]}],"id":[{"id":"10.13039\/501100005090","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005090","name":"Beijing Nova Program","doi-asserted-by":"publisher","award":["20220484217"],"award-info":[{"award-number":["20220484217"]}],"id":[{"id":"10.13039\/501100005090","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Youth Innovation Promotion Association of Chinese Academy of Sciences","award":["2021099"],"award-info":[{"award-number":["2021099"]}]},{"name":"CCF-Ant Research Fund CCF-AFSGRF","award":["20230207"],"award-info":[{"award-number":["20230207"]}]},{"name":"Pilotfor Major Scientific Research Facility of Jiangsu Province of China","award":["NO.BM2021800"],"award-info":[{"award-number":["NO.BM2021800"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3625549.3658654","type":"proceedings-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T15:55:29Z","timestamp":1725033329000},"page":"1-14","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["ElasticRoom: Multi-Tenant DNN Inference Engine via Co-design with Resource-constrained Compilation and Strong Priority Scheduling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-1081-0356","authenticated-orcid":false,"given":"Lixian","family":"Ma","sequence":"first","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, CAS, Beijing, China"},{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0634-6842","authenticated-orcid":false,"given":"Haoruo","family":"Chen","sequence":"additional","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, CAS, Beijing, China"},{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9678-7228","authenticated-orcid":false,"given":"En","family":"Shao","sequence":"additional","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, CAS, Beijing, China"},{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4940-5598","authenticated-orcid":false,"given":"Leping","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5832-0347","authenticated-orcid":false,"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6361-5948","authenticated-orcid":false,"given":"Guangming","family":"Tan","sequence":"additional","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, CAS, Beijing, China"},{"name":"University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168946"},{"key":"e_1_3_2_1_2_1","unstructured":"AMD. Amd's approach to gpu virtualization. https:\/\/drivers.amd.com\/relnotes\/amd_mxgpu_deploymentguide_vmware.pdf."},{"key":"e_1_3_2_1_3_1","unstructured":"AMD. Stream management hip api. https:\/\/docs.amd.com\/bundle\/HIP-API-Guide-v5.4.1\/page\/a00183.html."},{"key":"e_1_3_2_1_4_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901, 2020."},{"key":"e_1_3_2_1_5_1","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. {TVM}: An automated {End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pages 578--594, 2018."},{"key":"e_1_3_2_1_6_1","volume-title":"Multi-model machine learning inference serving with gpu spatial partitioning. arXiv preprint arXiv:2109.01611","author":"Choi Seungbeom","year":"2021","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. Multi-model machine learning inference serving with gpu spatial partitioning. arXiv preprint arXiv:2109.01611, 2021."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071121"},{"key":"e_1_3_2_1_8_1","first-page":"613","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J Franklin, Joseph E Gonzalez, and Ion Stoica. Clipper: A low-latency online prediction serving system. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17), pages 613--627, 2017."},{"key":"e_1_3_2_1_9_1","first-page":"183","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Cui Weihao","year":"2022","unstructured":"Weihao Cui, Han Zhao, Quan Chen, Hao Wei, Zirui Li, Deze Zeng, Chao Li, and Minyi Guo. Dvabatch: Diversity-aware multi-entry multi-exit batching for efficient processing of dnn services on gpus. In 2022 USENIX Annual Technical Conference (USENIX ATC 22), pages 183--198, 2022."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476143"},{"key":"e_1_3_2_1_11_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3576933"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453953.3453972"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1109\/P3HPC51967.2020.00008","volume-title":"2020 IEEE\/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)","author":"Goli Mehdi","year":"2020","unstructured":"Mehdi Goli, Kumudha Narasimhan, Ruyman Reyes, Ben Tracy, Daniel Soutar, Svetlozar Georgiev, Evarist M Fomenko, and Eugene Chereshnev. Towards cross-platform performance portability of dnn models using sycl. In 2020 IEEE\/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC), pages 25--35. IEEE, 2020."},{"key":"e_1_3_2_1_16_1","unstructured":"Google. Xla: Optimizing compiler for tensorflow. https:\/\/www.tensorflow.org\/xla."},{"key":"e_1_3_2_1_17_1","first-page":"443","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. Serving {DNNs} like clockwork: Performance predictability from the bottom up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 443--462, 2020."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508425"},{"key":"e_1_3_2_1_19_1","first-page":"539","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. Microsecond-scale preemption for concurrent gpu-accelerated dnn inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 539--558, 2022."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","volume-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861","author":"Howard Andrew G","year":"2017","unstructured":"Andrew G Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, and Hartwig Adam. Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861, 2017."},{"key":"e_1_3_2_1_22_1","unstructured":"INTEL. Scalable i\/o virtualization. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/introducing-intel-scalable-io-virtualization.html."},{"key":"e_1_3_2_1_23_1","unstructured":"INTEL. Submitting kernels to multiple queues. https:\/\/www.intel.com\/content\/www\/us\/en\/docs\/oneapi\/optimization-guide-gpu\/2023-0\/submitting-kernels-to-multiple-queues.html."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433707"},{"key":"e_1_3_2_1_25_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, 25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, 25, 2012."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507752"},{"key":"e_1_3_2_1_28_1","unstructured":"llama.cpp. llama.cpp for sycl. https:\/\/github.com\/ggerganov\/llama.cpp\/blob\/master\/README-sycl.md."},{"key":"e_1_3_2_1_29_1","first-page":"563","volume-title":"2021 IEEE 39th International Conference on Computer Design (ICCD)","author":"Ma Lixian","year":"2021","unstructured":"Lixian Ma, En Shao, Yueyuan Zhou, and Guangming Tan. Widepipe: High-throughput deep learning inference system on a cluster of neural processing units. In 2021 IEEE 39th International Conference on Computer Design (ICCD), pages 563--566. IEEE, 2021."},{"key":"e_1_3_2_1_30_1","unstructured":"NVIDIA. Cuda multi-streams. https:\/\/developer.NVIDIA.com\/blog\/gpu-pro-tip-cuda-7-streams-simplify-concurrency\/."},{"key":"e_1_3_2_1_31_1","unstructured":"NVIDIA. multi-process service. https:\/\/docs.NVIDIA.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf."},{"key":"e_1_3_2_1_32_1","unstructured":"NVIDIA. Nvidia multi-instance gpu user guide - nvidia developer. https:\/\/docs.NVIDIA.com\/datacenter\/tesla\/pdf\/NVIDIA_MIG_User_Guide.pdf."},{"key":"e_1_3_2_1_33_1","volume-title":"Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139","author":"Olston Christopher","year":"2017","unstructured":"Christopher Olston, Noah Fiedel, Kiril Gorovoy, Jeremiah Harmsen, Li Lao, Fangwei Li, Vinu Rajashekhar, Sukriti Ramesh, and Jordan Soyke. Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139, 2017."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453417.3453432"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037707"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4842-5574-2","volume-title":"Data parallel C++: mastering DPC++ for programming of heterogeneous systems using C++ and SYCL","author":"Reinders James","year":"2021","unstructured":"James Reinders, Ben Ashbaugh, James Brodman, Michael Kinsner, John Pennycook, and Xinmin Tian. Data parallel C++: mastering DPC++ for programming of heterogeneous systems using C++ and SYCL. Springer Nature, 2021."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_40_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556, 2014."},{"key":"e_1_3_2_1_41_1","volume-title":"Serving dnn models with multi-instance gpus: A case of the reconfigurable machine scheduling problem. arXiv preprint arXiv:2109.11067","author":"Tan Cheng","year":"2021","unstructured":"Cheng Tan, Zhichao Li, Jian Zhang, Yu Cao, Sikai Qi, Zherui Liu, Yibo Zhu, and Chuanxiong Guo. Serving dnn models with multi-instance gpus: A case of the reconfigurable machine scheduling problem. arXiv preprint arXiv:2109.11067, 2021."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_43_1","unstructured":"TVM. Tvm sycl backend rfc. https:\/\/github.com\/apache\/tvm-rfcs\/blob\/main\/rfcs\/0105-sycl-backend.md."},{"key":"e_1_3_2_1_44_1","volume-title":"Attention is all you need. Advances in neural information processing systems, 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446078"},{"key":"e_1_3_2_1_46_1","first-page":"204","article-title":"Bridging the gap between auto-tuners and hardware-native performance","volume":"4","author":"Xing Jiarong","year":"2022","unstructured":"Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, and Yibo Zhu. Bolt: Bridging the gap between auto-tuners and hardware-native performance. Proceedings of Machine Learning and Systems, 4:204--216, 2022.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3155284.3018754"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD51958.2021.9643501"},{"key":"e_1_3_2_1_49_1","volume-title":"A survey of multi-tenant deep learning inference on gpu. arXiv preprint arXiv:2203.09040","author":"Yu Fuxun","year":"2022","unstructured":"Fuxun Yu, Di Wang, Longfei Shangguan, Minjia Zhang, Chenchen Liu, and Xiang Chen. A survey of multi-tenant deep learning inference on gpu. arXiv preprint arXiv:2203.09040, 2022."},{"key":"e_1_3_2_1_50_1","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for transformer-based generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521--538, 2022."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","first-page":"1371","DOI":"10.1145\/3373376.3378457","volume-title":"Proceedings of the twenty-fifth international conference on architectural support for programming languages and operating systems","author":"Zhao Xia","year":"2020","unstructured":"Xia Zhao, Magnus Jahre, and Lieven Eeckhout. Hsm: A hybrid slowdown model for multitasking gpus. In Proceedings of the twenty-fifth international conference on architectural support for programming languages and operating systems, pages 1371--1385, 2020."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3205289.3205311"},{"key":"e_1_3_2_1_53_1","first-page":"863","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, et al. Ansor: Generating high-performance tensor programs for deep learning. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation, pages 863--879, 2020."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"},{"key":"e_1_3_2_1_55_1","first-page":"233","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, et al. {ROLLER}: Fast and efficient tensor compilation for deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 233--248, 2022."}],"event":{"name":"HPDC '24: 33rd International Symposium on High-Performance Parallel and Distributed Computing","location":"Pisa Italy","acronym":"HPDC '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658654","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3625549.3658654","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:50:37Z","timestamp":1750287037000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658654"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":55,"alternative-id":["10.1145\/3625549.3658654","10.1145\/3625549"],"URL":"https:\/\/doi.org\/10.1145\/3625549.3658654","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-08-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}