{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:19:54Z","timestamp":1773317994108,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"name":"National Research Foundation in Korea","award":["NRF-2016M3C4A7952587"],"award-info":[{"award-number":["NRF-2016M3C4A7952587"]}]},{"DOI":"10.13039\/100014553","name":"Samsung Advanced Institute of Technology","doi-asserted-by":"publisher","award":[""],"award-info":[{"award-number":[""]}],"id":[{"id":"10.13039\/100014553","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759846","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:05:39Z","timestamp":1762963539000},"page":"1697-1709","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Compile-Time QoS Scheme for Deep Learning Inferences"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-9960-4193","authenticated-orcid":false,"given":"Sungin","family":"Hong","sequence":"first","affiliation":[{"name":"Sungkyunkwan University, Suwon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6810-0903","authenticated-orcid":false,"given":"Hyunjun","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Advanced Institute of Technology, Suwon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7182-8452","authenticated-orcid":false,"given":"Hwansoo","family":"Han","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Suwon, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00034"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00073"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/RTSS.2017.00017"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"crossref","unstructured":"Katie Atkinson Trevor Bench-Capon and Danushka Bollegala. 2020. Explanation in AI and law: Past present and future. Artificial Intelligence 289 (2020) 103387.","DOI":"10.1016\/j.artint.2020.103387"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Longbing Cao. 2022. Ai in finance: challenges techniques and opportunities. ACM Computing Surveys (CSUR) 55 3 (2022) 1\u201338.","DOI":"10.1145\/3502289"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018748"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_2"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037700"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Quan Chen Hailong Yang Jason Mars and Lingjia Tang. 2016. Baymax: Qos awareness and increased utilization for non-preemptive accelerators in warehouse scale computers. ACM SIGPLAN Notices 51 4 (2016) 681\u2013696.","DOI":"10.1145\/2954679.2872368"},{"key":"e_1_3_3_2_11_2","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 578\u2013594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen"},{"key":"e_1_3_3_2_12_2","first-page":"199","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. 2022. Serving heterogeneous machine learning models on { Multi-GPU} servers with { Spatio-Temporal} sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 199\u2013216."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00049"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00027"},{"key":"e_1_3_3_2_15_2","first-page":"4171","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 4171\u20134186."},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.5555\/3571885.3571976"},{"key":"e_1_3_3_2_18_2","first-page":"443","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving { DNNs} like clockwork: Performance predictability from the bottom up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 443\u2013462."},{"key":"e_1_3_3_2_19_2","first-page":"539","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. 2022. Microsecond-scale preemption for concurrent { GPU-accelerated}{ DNN} inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 539\u2013558."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS48715.2020.000-8"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Mudassar\u00a0Ali Khan Sabahat Israr Abeer S\u00a0Almogren Ikram\u00a0Ud Din Ahmad Almogren and Joel\u00a0JPC Rodrigues. 2021. Using augmented reality and deep learning to enhance Taxila Museum experience. Journal of Real-Time Image Processing 18 (2021) 321\u2013332.","DOI":"10.1007\/s11554-020-01038-y"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/RTCSA55878.2022.00027"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605627"},{"key":"e_1_3_3_2_25_2","first-page":"881","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. Rammer: Enabling Holistic Deep Learning Compiler Optimizations with rTasks. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 881\u2013897. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/ma"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458837"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629565"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_3_2_30_2","unstructured":"NVIDIA. 2024. CUDA C++ Best Practices Guide. Retrieved August 16 2024 from https:\/\/docs.nvidia.com\/cuda\/cuda-c-best-practices-guide\/index.html"},{"key":"e_1_3_3_2_31_2","unstructured":"NVIDIA. 2024. CUDA C++ Programming Guide. Retrieved August 16 2024 from https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html"},{"key":"e_1_3_3_2_32_2","unstructured":"NVIDIA. 2024. Multi-Process Service. Retrieved August 16 2024 from https:\/\/docs.nvidia.com\/deploy\/mps\/"},{"key":"e_1_3_3_2_33_2","unstructured":"NVIDIA. 2024. NVIDIA A100 Tensor Core GPU Architecture. Retrieved August 16 2024 from https:\/\/resources.nvidia.com\/en-us-genomics-ep\/ampere-architecture-white-paper"},{"key":"e_1_3_3_2_34_2","unstructured":"OPENXLA\/IREE. 2024. IREE: Intermediate Representation Execution Environment. Retrieved August 16 2024 from https:\/\/iree.dev\/"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00042"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"crossref","unstructured":"Francesco Piccialli Vittorio Di\u00a0Somma Fabio Giampaolo Salvatore Cuomo and Giancarlo Fortino. 2021. A survey on deep learning in medicine: Why how and when? Information Fusion 66 (2021) 111\u2013137.","DOI":"10.1016\/j.inffus.2020.09.006"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00045"},{"key":"e_1_3_3_2_39_2","first-page":"397","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja\u00a0J Yadwadkar, and Christos Kozyrakis. 2021. { INFaaS} : Automated model-less inference serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 397\u2013411."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Iria Santos Luz Castro Nereida Rodriguez-Fernandez Alvaro Torrente-Patino and Adrian Carballal. 2021. Artificial neural networks and deep learning in the visual arts: A review. Neural Computing and Applications 33 (2021) 121\u2013157.","DOI":"10.1007\/s00521-020-05565-4"},{"key":"e_1_3_3_2_41_2","unstructured":"Haichen Shen Jared Roesch Zhi Chen Wei Chen Yong Wu Mu Li Vin Sharma Zachary Tatlock and Yida Wang. 2021. Nimble: Efficiently compiling dynamic neural networks for model inference. Proceedings of Machine Learning and Systems 3 (2021) 208\u2013222."},{"key":"e_1_3_3_2_42_2","first-page":"701","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Shi Yining","year":"2023","unstructured":"Yining Shi, Zhi Yang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Ziming Miao, Yuxiao Guo, Fan Yang, and Lidong Zhou. 2023. Welder: Scheduling deep learning memory access via tile-graph. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 701\u2013718."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Nikhil Singh Guillermo Bernal Daria Savchenko and Elena\u00a0L Glassman. 2023. Where to hide a stolen elephant: Leaps in creative writing with multimodal machine intelligence. ACM Transactions on Computer-Human Interaction 30 5 (2023) 1\u201357.","DOI":"10.1145\/3511599"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569650"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Bo Wu Xu Liu Xiaobo Zhou and Changjun Jiang. 2017. Flep: Enabling flexible and efficient preemption on gpus. ACM SIGPLAN Notices 52 4 (2017) 483\u2013496.","DOI":"10.1145\/3093336.3037742"},{"key":"e_1_3_3_2_46_2","first-page":"69","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Zili Zhang, Zhihao Bai, Xuanzhe Liu, and Xin Jin. 2023. Transparent { GPU} sharing in container clouds for deep learning workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 69\u201385."},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Fei Xu Jianian Xu Jiabin Chen Li Chen Ruitao Shang Zhi Zhou and Fangming Liu. 2022. igniter: Interference-aware gpu resource provisioning for predictable dnn inference in the cloud. IEEE Transactions on Parallel and Distributed Systems 34 3 (2022) 812\u2013827.","DOI":"10.1109\/TPDS.2022.3232715"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00048"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Fengxiang Zhang and Alan Burns. 2009. Schedulability analysis for real-time systems with EDF scheduling. IEEE Trans. Comput. 58 9 (2009) 1250\u20131258.","DOI":"10.1109\/TC.2009.58"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330351"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00064"},{"key":"e_1_3_3_2_53_2","unstructured":"Jie Zhao Xiong Gao Ruijie Xia Zhaochuang Zhang Deshi Chen Lei Chen Renwei Zhang Zhen Geng Bin Cheng and Xuefeng Jin. 2022. Apollo: Automatic partition-based operator fusion through layer by layer optimization. Proceedings of Machine Learning and Systems 4 (2022) 1\u201319."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Jianlong Zhong and Bingsheng He. 2013. Kernelet: High-throughput GPU kernel executions with dynamic slicing and scheduling. IEEE Transactions on Parallel and Distributed Systems 25 6 (2013) 1522\u20131532.","DOI":"10.1109\/TPDS.2013.257"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759846","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:33:52Z","timestamp":1773254032000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759846"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":54,"alternative-id":["10.1145\/3712285.3759846","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759846","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}