{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T17:28:40Z","timestamp":1769102920166,"version":"3.49.0"},"reference-count":47,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Research Grants Council of Hong Kong","award":["RFS2425-4S02"],"award-info":[{"award-number":["RFS2425-4S02"]}]},{"name":"Research Grants Council of Hong Kong","award":["CUHK14210723"],"award-info":[{"award-number":["CUHK14210723"]}]},{"name":"MIND project","award":["MINDXZ202404"],"award-info":[{"award-number":["MINDXZ202404"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1109\/tpds.2025.3578630","type":"journal-article","created":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T13:46:51Z","timestamp":1749649611000},"page":"1904-1919","source":"Crossref","is-referenced-by-count":1,"title":["A Learned Performance Model With Transfer Learning Across GPUs on Tensorized Instructions"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5337-1783","authenticated-orcid":false,"given":"Yang","family":"Bai","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong, SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6740-8413","authenticated-orcid":false,"given":"Mingjun","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong, SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5857-7393","authenticated-orcid":false,"given":"Wendong","family":"Xu","sequence":"additional","affiliation":[{"name":"Department of Electrical and Electronic Engineering, The University of Hong Kong, Hong Kong, SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6406-4810","authenticated-orcid":false,"given":"Bei","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong, SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412819"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01099"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00338"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref9","first-page":"578","article-title":"TVM: An automated end-to-end optimizing compiler for deep learning","volume-title":"Proc. USENIX Symp. Operating Syst. Des. Implementation","author":"Chen"},{"key":"ref10","first-page":"863","article-title":"Ansor: Generating high-performance tensor programs for deep learning","volume-title":"Proc. USENIX Symp. Operating Syst. Des. Implementation","author":"Zheng"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD51958.2021.9643487"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3030548"},{"key":"ref13","first-page":"38","article-title":"Cortex: A compiler for recursive deep learning models","volume-title":"Proc. Conf. Mach. Learn. Syst.","author":"Fegade"},{"key":"ref14","first-page":"14","article-title":"TenSet: A large-scale program performance dataset for learned tensor compilers","volume-title":"Proc. 35th Conf. Neural Inf. Process. Syst. Datasets Benchmarks Track","author":"Zheng"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322967"},{"key":"ref16","first-page":"578","article-title":"$\\lbrace${TVM$\\rbrace$}: An automated end-to-end optimizing compiler for deep learning","volume-title":"Proc. USENIX Symp. Operating Syst. Des. Implementation","author":"Chen"},{"key":"ref17","article-title":"XLA : Compiling machine learning for peak performance","author":"Sabne","year":"2020","journal-title":"Google Res"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"ref19","first-page":"3393","article-title":"Learning to optimize tensor programs","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Chen"},{"key":"ref20","first-page":"233","article-title":"$\\lbrace${ROLLER$\\rbrace$}: Fast and efficient tensor compilation for deep learning","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation","author":"Zhu"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"ref23","first-page":"17","article-title":"Chameleon: Adaptive code optimization for expedited deep neural network compilation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ahn"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17462"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2023.3317169"},{"key":"ref26","first-page":"680","article-title":"ALCOP: Automatic load-compute pipelining in deep learning compiler for AI-GPUs","volume-title":"Proc. Conf. Mach. Learn. Syst.","author":"Huang"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2023.3241110"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-1150"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i8.16826"},{"key":"ref30","first-page":"387","article-title":"A learned performance model for tensor processing units","volume-title":"Proc. Conf. Mach. Learn. Syst.","author":"Kaufman"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530584"},{"key":"ref32","first-page":"323","article-title":"Value learning for throughput optimization of deep learning workloads","volume-title":"Proc. Conf. Mach. Learn. Syst.","author":"Steiner"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.534"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"ref35","article-title":"Nvidia tensor cores","year":"2017"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref37","article-title":"Intel math kernel library for deep learning networks","year":"2017"},{"key":"ref38","article-title":"ARM compute library","year":"2017"},{"key":"ref39","article-title":"cuDNN: Efficient primitives for deep learning","author":"Chetlur","year":"2014"},{"key":"ref40","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","volume-title":"Proc. 12th {USENIX} Symp. Operating Syst. Des. Implementation","author":"Abadi"},{"key":"ref41","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Paszke"},{"key":"ref42","article-title":"MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems","author":"Chen","year":"2015"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.250"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3211346.3211348"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/1273496.1273513"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3576933"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00533"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/71\/11083525\/11030316.pdf?arnumber=11030316","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,18]],"date-time":"2025-07-18T17:47:04Z","timestamp":1752860824000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11030316\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":47,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2025.3578630","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9]]}}}