{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T09:07:26Z","timestamp":1771924046074,"version":"3.50.1"},"reference-count":54,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/cgo68049.2026.11395190","type":"proceedings-article","created":{"date-parts":[[2026,2,23]],"date-time":"2026-02-23T20:46:32Z","timestamp":1771879592000},"page":"362-374","source":"Crossref","is-referenced-by-count":0,"title":["From Threads to Tiles: T2T, a Compiler for CUDA-to-NPU Translation via 2D Vectorization"],"prefix":"10.1109","author":[{"given":"Shuaijiang","family":"Li","sequence":"first","affiliation":[{"name":"SKLP, ICT, CAS UCAS,Beijing,China"}]},{"given":"Jiacheng","family":"Zhao","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS UCAS,Beijing,China"}]},{"given":"Ying","family":"Liu","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS UCAS,Beijing,China"}]},{"given":"Shuoming","family":"Zhang","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS UCAS,Beijing,China"}]},{"given":"Lei","family":"Chen","sequence":"additional","affiliation":[{"name":"UCAS,Beijing,China"}]},{"given":"Yijin","family":"Li","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS,Beijing,China"}]},{"given":"Yangyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS UCAS,Beijing,China"}]},{"given":"Zhicheng","family":"Li","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS UCAS,Beijing,China"}]},{"given":"Runyu","family":"Zhou","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS UCAS,Beijing,China"}]},{"given":"Xiyu","family":"Shi","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS,Beijing,China"}]},{"given":"Chunwei","family":"Xia","sequence":"additional","affiliation":[{"name":"University of Leeds,Leeds,UK"}]},{"given":"Yuan","family":"Wen","sequence":"additional","affiliation":[{"name":"University of Aberdeen,Aberdeen,UK"}]},{"given":"Xiaobing","family":"Feng","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS UCAS,Beijing,China"}]},{"given":"Huimin","family":"Cui","sequence":"additional","affiliation":[{"name":"SKLP, ICT, CAS UCAS XCORESIGMA CO.,LTD.,Beijing,China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"CUDA C++ programming guide"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.softx.2015.06.001"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1021\/ct400314y"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1147\/jrd.2018.2888986"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/b978-0-12-384988-5.00009-7"},{"key":"ref6","article-title":"RAPIDS: Open GPU data science","year":"2018"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/2688500.2688538"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389705"},{"key":"ref9","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume-title":"Advances in neural information processing systems","volume":"32","author":"Paszke","year":"2019"},{"key":"ref10","first-page":"265","article-title":"{TensorFlow}: a system for {Large-Scale} machine learning","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Abadi"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/hpca51647.2021.00071"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/2654822.2541967"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/hcs52781.2021.9567075"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/iiswc.2009.5306797"},{"key":"ref16","article-title":"The AI CUDA Engineer: Agentic CUDA kernel discovery, optimization and composition","author":"Lange","year":"2025","journal-title":"Sakana AI, Tech. Rep."},{"key":"ref17","article-title":"Cambricon MLU"},{"key":"ref18","article-title":"Ascend C"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/cgo.2011.5764682"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/cgo51591.2021.9370308"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3469030"},{"key":"ref22","article-title":"Composable and modular code generation in MLIR: A structured and retargetable approach to tensor compiler construction","author":"Vasilache","year":"2022"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/cgo51591.2021.9370332"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3579990.3580006"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3696443.3708956"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577475"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3554736"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/pact52795.2021.00011"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/cgo57630.2024.10444828"},{"key":"ref30","article-title":"NVIDIA cuDNN: CUDA deep neural network library","year":"2024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3331059"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-89740-8_2"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/1854273.1854303"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/1854273.1854318"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/pact.2011.62"},{"key":"ref36","article-title":"DPCT","year":"2021"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3388333.3388641"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/1772954.1772971"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304623"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ispass48437.2020.00020"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3458744.3473356"},{"key":"ref42","article-title":"Supporting CUDA for an extended RISC-V GPU architecture","author":"Han","year":"2021"},{"key":"ref43","article-title":"Compiling and executing CUDA programs in emulation mode"},{"key":"ref44","article-title":"HIPIFY: Convert CUDA to portable C++ code"},{"key":"ref45","article-title":"CUDA on non-NVIDIA GPUs"},{"key":"ref46","article-title":"SYCL - the C++ single-source heterogeneous programming for acceleration","year":"2025"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3529538.3529992"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3489525.3511681"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3456669.3456684"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/s10766-014-0320-y"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/inpar.2012.6339601"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-28652-0_1"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/cgo.2004.1281665"},{"key":"ref54","first-page":"239","article-title":"QiMeng-Xpiler: Transcompiling tensor programs for deep learning systems with a neural-symbolic approach","volume-title":"Proceedings of the 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 2025)","author":"Dong"}],"event":{"name":"2026 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11395173\/11394837\/11395190.pdf?arnumber=11395190","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T08:11:22Z","timestamp":1771920682000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11395190\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":54,"URL":"https:\/\/doi.org\/10.1109\/cgo68049.2026.11395190","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}