{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:36:14Z","timestamp":1768030574596,"version":"3.49.0"},"reference-count":32,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&D Program of China","award":["2021ZD0110203"],"award-info":[{"award-number":["2021ZD0110203"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U22A2028"],"award-info":[{"award-number":["U22A2028"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61732002"],"award-info":[{"award-number":["61732002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072018"],"award-info":[{"award-number":["62072018"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of 
China","doi-asserted-by":"publisher","award":["62076168"],"award-info":[{"award-number":["62076168"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Comput."],"published-print":{"date-parts":[[2023,11]]},"DOI":"10.1109\/tc.2023.3288758","type":"journal-article","created":{"date-parts":[[2023,6,23]],"date-time":"2023-06-23T17:27:52Z","timestamp":1687541272000},"page":"3178-3190","source":"Crossref","is-referenced-by-count":9,"title":["HAOTuner: A Hardware Adaptive Operator Auto-Tuner for Dynamic Shape Tensor Compilers"],"prefix":"10.1109","volume":"72","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4847-8148","authenticated-orcid":false,"given":"Pengyu","family":"Mu","sequence":"first","affiliation":[{"name":"Hangzhou Innovation Institute, Beihang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1829-2817","authenticated-orcid":false,"given":"Yi","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2741-6033","authenticated-orcid":false,"given":"Rui","family":"Wang","sequence":"additional","affiliation":[{"name":"Hangzhou Innovation Institute, Beihang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6163-2074","authenticated-orcid":false,"given":"Guoxiang","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"given":"Zhonghao","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1101-7927","authenticated-orcid":false,"given":"Hailong","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Computer Science 
and Engineering, Beihang University, Beijing, China"}]},{"given":"Zhongzhi","family":"Luan","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5382-1473","authenticated-orcid":false,"given":"Depei","family":"Qian","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Beihang University, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"cuBLAS: CUDA basic linear algebra subroutine library","year":"2021"},{"key":"ref2","article-title":"cuDNN: Efficient primitives for deep learning","author":"Chetlur","year":"2014"},{"key":"ref3","article-title":"CUTLASS: CUDA templates for linear algebra subroutines","year":"2022"},{"key":"ref4","article-title":"oneAPI deep neural network library (oneDNN)","year":"2021"},{"key":"ref5","first-page":"578","article-title":"TVM: An automated end-to-end optimizing compiler for deep learning","volume-title":"Proc. 13th USENIX Symp. Operating Syst. Des. Implementation","author":"Chen"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3355606"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322967"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/cgo51591.2021.9370308"},{"key":"ref9","first-page":"863","article-title":"Ansor: Generating high-performance tensor programs for deep learning","volume-title":"Proc. 14th USENIX Symp. Operating Syst. Des. Implementation","author":"Zheng"},{"key":"ref10","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. 9th Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref11","first-page":"173","article-title":"Deep speech 2 : End-to-end speech recognition in English and Mandarin","volume-title":"Proc. 33rd Int. Conf. Mach. 
Learn.","author":"Amodei"},{"key":"ref12","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics: Hum. Lang. Technol.","author":"Devlin"},{"key":"ref13","article-title":"Packing: Towards 2x NLP BERT acceleration","author":"Kosec","year":"2021"},{"key":"ref14","article-title":"DietCode: Automatic optimization for dynamic tensor programs","volume-title":"Proc. Mach. Learn. Syst.","author":"Zheng"},{"key":"ref15","article-title":"OpenBLAS: An optimized BLAS library","year":"2021"},{"key":"ref16","article-title":"Intel math kernel library for deep learning networks","year":"2017"},{"key":"ref17","article-title":"The CORA tensor compiler: Compilation for ragged tensors with minimal padding","volume-title":"Proc. Mach. Learn. Syst.","author":"Fegade"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/jproc.2018.2857721"},{"key":"ref19","article-title":"Nimble: Efficiently compiling dynamic neural networks for model inference","volume-title":"Proc. Mach. Learn. Syst.","author":"Shen"},{"key":"ref20","first-page":"3146","article-title":"LightGBM: A highly efficient gradient boosting decision tree","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ke"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"ref22","first-page":"6679","article-title":"TabNet: Attentive interpretable tabular learning","volume-title":"Proc. 35th AAAI Conf. Artif. Intell., 33rd Conf. Innov. Appl. Artif. Intell., 11th Symp. Educ. Adv. Artif. Intell.","author":"Arik"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/1273496.1273521"},{"key":"ref24","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vaswani"},{"key":"ref25","article-title":"XLA: TensorFlow, compiled. 
TensorFlow dev summit (2017)","author":"Leary","year":"2017"},{"key":"ref26","first-page":"233","article-title":"ROLLER: Fast and efficient tensor compilation for deep learning","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation","author":"Zhu"},{"key":"ref27","article-title":"Selective tuning","author":"Yu","year":"2019"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458838"},{"key":"ref29","article-title":"TenSet: A large-scale program performance dataset for learned tensor compilers","volume-title":"Proc. Neural Inf. Process. Syst. Track Datasets Benchmarks","author":"Zheng"},{"key":"ref30","article-title":"Moses: Efficient exploitation of cross-device transferable features for tensor program optimization","author":"Zhao","year":"2022"},{"key":"ref31","article-title":"Chameleon: Adaptive code optimization for expedited deep neural network compilation","volume-title":"Proc. 8th Int. Conf. Learn. Representations","author":"Ahn"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3368826.3377928"}],"container-title":["IEEE Transactions on Computers"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/12\/10274758\/10160123.pdf?arnumber=10160123","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,12]],"date-time":"2024-12-12T19:10:41Z","timestamp":1734030641000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10160123\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11]]},"references-count":32,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/tc.2023.3288758","relation":{},"ISSN":["0018-9340","1557-9956","2326-3814"],"issn-type":[{"value":"0018-9340","type":"print"},{"value":"1557-9956","type":"electronic"},{"value":"2326-3814","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,11]]}}}