{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:18:23Z","timestamp":1773317903163,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China Grant","award":["No. 62202216"],"award-info":[{"award-number":["No. 62202216"]}]},{"name":"the Guangdong Basic and Applied Basic Research Foundation Grant","award":["No. 2023A1515010244"],"award-info":[{"award-number":["No. 2023A1515010244"]}]},{"name":"the Shenzhen Science and Technology Program Grant","award":["20231121101752002"],"award-info":[{"award-number":["20231121101752002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759857","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"519-532","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["BOER: Enhancing Resource Utilization for Deep Learning Inference with Hybrid Spatial GPU Sharing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-1277-8164","authenticated-orcid":false,"given":"Bowen","family":"Zhang","sequence":"first","affiliation":[{"name":"Southern University of Science and Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0106-4660","authenticated-orcid":false,"given":"Yuhang","family":"Wang","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1903-6428","authenticated-orcid":false,"given":"Zhuozhao","family":"Li","sequence":"additional","affiliation":[{"name":"Southern University of Science and Technology, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"crossref","unstructured":"Quan Chen Hailong Yang Jason Mars and Lingjia Tang. 2016. Baymax: Qos awareness and increased utilization for non-preemptive accelerators in warehouse scale computers. ACM SIGPLAN Notices 51 4 (2016) 681\u2013696.","DOI":"10.1145\/2954679.2872368"},{"key":"e_1_3_3_3_3_2","first-page":"199","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. 2022. Serving heterogeneous machine learning models on Multi-GPU servers with Spatio-Temporal sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 199\u2013216."},{"key":"e_1_3_3_3_4_2","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"e_1_3_3_3_6_2","first-page":"443","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like clockwork: Performance predictability from the bottom up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 443\u2013462."},{"key":"e_1_3_3_3_7_2","unstructured":"Awni Hannun Carl Case Jared Casper Bryan Catanzaro Greg Diamos Erich Elsen Ryan Prenger Sanjeev Satheesh Shubho Sengupta Adam Coates et\u00a0al. 2014. Deep Speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.5567 (2014)."},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_3_9_2","unstructured":"Andrew\u00a0G Howard Menglong Zhu Bo Chen Dmitry Kalenichenko Weijun Wang Tobias Weyand Marco Andreetto and Hartwig Adam. 2017. Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1704.04861 (2017)."},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2018.00035"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530510"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"crossref","unstructured":"Aline\u00a0AS Leao Franklina\u00a0MB Toledo Jos\u00e9\u00a0Fernando Oliveira Maria\u00a0Ant\u00f3nia Carravilla and Ram\u00f3n Alvarez-Vald\u00e9s. 2020. Irregular packing problems: A review of mathematical models. European Journal of Operational Research 282 3 (2020) 803\u2013822.","DOI":"10.1016\/j.ejor.2019.04.045"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"crossref","unstructured":"Yann LeCun L\u00e9on Bottou Yoshua Bengio and Patrick Haffner. 1998. Gradient-based learning applied to document recognition. Proc. IEEE 86 11 (1998) 2278\u20132324.","DOI":"10.1109\/5.726791"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00048"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW55747.2022.00124"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563510"},{"key":"e_1_3_3_3_18_2","first-page":"663","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph\u00a0E Gonzalez, et\u00a0al. 2023. AlpaServe: Statistical multiplexing with model parallelism for Deep Learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663\u2013679."},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"crossref","unstructured":"Xuanzhe Liu Yihao Zhao Shufan Liu Xiang Li Yibo Zhu Xin Liu and Xin Jin. 2024. MuxFlow: efficient GPU sharing in production-level clusters with more than 10000 GPUs. Science China Information Sciences 67 12 (2024) 1\u201317.","DOI":"10.1007\/s11432-024-4227-2"},{"key":"e_1_3_3_3_21_2","first-page":"579","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Mohan Jayashree","year":"2022","unstructured":"Jayashree Mohan, Amar Phanishayee, Janardhan Kulkarni, and Vijay Chidambaram. 2022. Looking beyond GPUs for DNN scheduling on Multi-Tenant clusters. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 579\u2013596."},{"key":"e_1_3_3_3_22_2","volume-title":"NVIDIA Multi-Process Service (MPS) Overview","year":"2024","unstructured":"NVIDIA. 2024. NVIDIA Multi-Process Service (MPS) Overview. NVIDIA Corporation. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html Accessed: 2025-08-25."},{"key":"e_1_3_3_3_23_2","volume-title":"NVIDIA Multi-Instance GPU User Guide","author":"Corporation NVIDIA","year":"2024","unstructured":"NVIDIA Corporation. 2024. NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html Accessed: 2025-08-25."},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00025"},{"key":"e_1_3_3_3_25_2","unstructured":"Jiaxing Qi Wencong Xiao Mingzhen Li Chaojie Yang Yong Li Wei Lin Hailong Yang Zhongzhi Luan and Depei Qian. 2024. ElasticBatch: A Learning-Augmented Elastic Scheduling System for Batch Inference on MIG. IEEE Transactions on Parallel and Distributed Systems (2024)."},{"key":"e_1_3_3_3_26_2","unstructured":"Joseph Redmon and Ali Farhadi. 2018. Yolov3: An incremental improvement. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.02767 (2018)."},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_3_3_28_2","first-page":"947","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Shubha Sudipta\u00a0Saha","year":"2024","unstructured":"Sudipta\u00a0Saha Shubha, Haiying Shen, and Anand Iyer. 2024. USHER: Holistic Interference Avoidance for Resource Optimized ML Inference. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 947\u2013964."},{"key":"e_1_3_3_3_29_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1409.1556 (2014)."},{"key":"e_1_3_3_3_30_2","unstructured":"Jasper Snoek Hugo Larochelle and Ryan\u00a0P Adams. 2012. Practical Bayesian optimization of machine learning algorithms. Advances in neural information processing systems 25 (2012)."},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_3_3_32_2","unstructured":"Cheng Tan Zhichao Li Jian Zhang Yu Cao Sikai Qi Zherui Liu Yibo Zhu and Chuanxiong Guo. 2021. Serving DNN models with multi-instance GPUs: A case of the reconfigurable machine scheduling problem. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.11067 (2021)."},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00293"},{"key":"e_1_3_3_3_34_2","unstructured":"Tianyu Wang Sheng Li Bingyao Li Yue Dai Ao Li Geng Yuan Yufei Ding Youtao Zhang and Xulong Tang. 2024. Improving GPU Multi-Tenancy Through Dynamic Multi-Instance GPU Reconfiguration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.13126 (2024)."},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3688351.3689156"},{"key":"e_1_3_3_3_36_2","first-page":"945","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the wild: Workload analysis and scheduling in Large-Scale heterogeneous GPU clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 945\u2013960."},{"key":"e_1_3_3_3_37_2","first-page":"595","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, et\u00a0al. 2018. Gandiva: Introspective cluster scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 595\u2013610."},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673089"},{"key":"e_1_3_3_3_39_2","first-page":"787","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 787\u2013808."},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"crossref","unstructured":"Wei Zhang Quan Chen Ningxin Zheng Weihao Cui Kaihua Fu and Minyi Guo. 2021. Toward qos-awareness and improved utilization of spatial multitasking gpus. IEEE Trans. Comput. 71 4 (2021) 866\u2013879.","DOI":"10.1109\/TC.2021.3064352"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"crossref","unstructured":"Yunqi Zhang David Meisner Jason Mars and Lingjia Tang. 2016. Treadmill: Attributing the source of tail latency through precise load testing and statistical inference. ACM SIGARCH Computer Architecture News 44 3 (2016) 456\u2013468.","DOI":"10.1145\/3007787.3001186"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544224"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759857","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:30:40Z","timestamp":1773253840000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759857"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":41,"alternative-id":["10.1145\/3712285.3759857","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759857","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}