{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T22:43:14Z","timestamp":1772491394742,"version":"3.50.1"},"reference-count":40,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFB4501400"],"award-info":[{"award-number":["2022YFB4501400"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302302"],"award-info":[{"award-number":["62302302"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62232011"],"award-info":[{"award-number":["62232011"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62022057"],"award-info":[{"award-number":["62022057"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61832006"],"award-info":[{"award-number":["61832006"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Comput."],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1109\/tc.2024.3477995","type":"journal-article","created":{"date-parts":[[2024,10,10]],"date-time":"2024-10-10T17:22:05Z","timestamp":1728580925000},"page":"386-400","source":"Crossref","is-referenced-by-count":1,"title":["Adaptive Kernel Fusion for Improving the GPU Utilization While Ensuring\u00a0QoS"],"prefix":"10.1109","volume":"74","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1561-5329","authenticated-orcid":false,"given":"Han","family":"Zhao","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1239-5134","authenticated-orcid":false,"given":"Junxiao","family":"Deng","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6646-5260","authenticated-orcid":false,"given":"Weihao","family":"Cui","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5832-0347","authenticated-orcid":false,"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8425-8743","authenticated-orcid":false,"given":"Youtao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Computer Science Department, University of Pittsburgh, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3276-1202","authenticated-orcid":false,"given":"Deze","family":"Zeng","sequence":"additional","affiliation":[{"name":"School of Computer Science, China University of Geosciences, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-2302","authenticated-orcid":false,"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2749472"},{"key":"ref3","article-title":"Nvidia volta GPU architecture whitepaper","year":"2024"},{"key":"ref4","first-page":"443","article-title":"Serving DNNs like clockwork: Performance predictability from the bottom up","volume-title":"Proc. 14th USENIX Symp. Operating Syst. Des. Implementation (OSDI)","author":"Gujarati","year":"2020"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/2954679.2872368"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080203"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037700"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref11","article-title":"Parboil: A revised benchmark suite for scientific and commercial throughput computing","volume":"127","author":"Stratton","year":"2012","journal-title":"Center Reliable High-Perform. Comput."},{"key":"ref12","article-title":"CUDA MPS","year":"2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446078"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD46524.2019.00075"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00030"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/2818950.2818979"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2018.8341982"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476143"},{"key":"ref19","first-page":"17","article-title":"TimeGraph: GPU scheduling for real-time multi-tasking environments","volume-title":"Proc. USENIX ATC (ATC)","author":"Kato","year":"2011"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/RTSS.2013.12"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/IC2E55432.2022.00030"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378457"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00034"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2011.5749745"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056063"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3629526.3653835"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-020-0072-3"},{"key":"ref28","first-page":"1","article-title":"Characterization and prediction of deep learning workloads in large-scale gpu datacenters","volume-title":"Proc. Int. Conf. High Perform. Comput., Netw., Storage Anal.","author":"Hu","year":"2021"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref30","article-title":"Tensor core example code","year":"2024"},{"key":"ref31","article-title":"Nvidia cutlass","year":"2024"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339596"},{"key":"ref33","first-page":"881","article-title":"Rammer: Enabling holistic deep learning compiler optimizations with rTasks","volume-title":"Proc. 14th USENIX Symp. Operating Syst. Des. Implementation (OSDI)","author":"Ma","year":"2020"},{"key":"ref34","article-title":"Nvidia nsight compute","year":"2024"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835937"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2015.31"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00064"},{"key":"ref38","article-title":"Tensorflow create customized OPS"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00045"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/2508148.2485974"}],"container-title":["IEEE Transactions on Computers"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/12\/10849950\/10713257.pdf?arnumber=10713257","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,23]],"date-time":"2025-01-23T19:52:07Z","timestamp":1737661927000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10713257\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2]]},"references-count":40,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tc.2024.3477995","relation":{},"ISSN":["0018-9340","1557-9956","2326-3814"],"issn-type":[{"value":"0018-9340","type":"print"},{"value":"1557-9956","type":"electronic"},{"value":"2326-3814","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2]]}}}