{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:29:53Z","timestamp":1775579393345,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":4,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"US National Science Foundation","award":["2217154"],"award-info":[{"award-number":["2217154"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656626","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"549-561","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Accelerated Auto-Tuning of GPU Kernels for Tensor Computations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2610-042X","authenticated-orcid":false,"given":"Chendi","family":"Li","sequence":"first","affiliation":[{"name":"University of Utah, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7787-6460","authenticated-orcid":false,"given":"Yufan","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Utah, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4285-1439","authenticated-orcid":false,"given":"Sina Mahdipour","family":"Saravani","sequence":"additional","affiliation":[{"name":"University of Utah, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4737-2034","authenticated-orcid":false,"given":"Ponnuswamy","family":"Sadayappan","sequence":"additional","affiliation":[{"name":"University of Utah, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. Apache TVM: An End to End Machine Learning Compiler Framework for CPUs GPUs and accelerators. https:\/\/tvm.apache.org\/."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628092"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2018.2841200"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3279233"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD55607.2022.00061"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"e_1_3_2_1_8_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie\u00a0Q. Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018, Carlsbad, CA, USA, October 8-10, 2018. USENIX Association, 578\u2013594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen"},{"key":"e_1_3_2_1_9_1","unstructured":"Tianqi Chen Lianmin Zheng Eddie\u00a0Q. Yan Ziheng Jiang Thierry Moreau Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. 2018. Learning to Optimize Tensor Programs. (2018) 3393\u20133404. https:\/\/proceedings.neurips.cc\/paper\/2018\/hash\/8b5700012be65c9da25f49408d959ca0-Abstract.html"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/N19-1423"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569682"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","unstructured":"Erik\u00a0Orm Hellsten Artur L.\u00a0F. Souza Johannes Lenfers Rubens Lacouture Olivia Hsu Adel Ejjeh Fredrik Kjolstad Michel Steuwer Kunle Olukotun and Luigi Nardi. 2023. BaCO: A Fast and Portable Bayesian Compiler Optimization Framework. (2023) 19\u201342. https:\/\/doi.org\/10.1145\/3623278.3624770","DOI":"10.1145\/3623278.3624770"},{"key":"e_1_3_2_1_14_1","volume-title":"Optimizing CNN Model Inference on CPUs. In 2019 USENIX Annual Technical Conference, USENIX ATC 2019","author":"Liu Yizhi","year":"2019","unstructured":"Yizhi Liu, Yao Wang, Ruofei Yu, Mu Li, Vin Sharma, and Yida Wang. 2019. Optimizing CNN Model Inference on CPUs. In 2019 USENIX Annual Technical Conference, USENIX ATC 2019, Renton, WA, USA, July 10-12, 2019. USENIX Association, 1025\u20131040. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/liu-yizhi"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.690"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3497776.3517774"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161054"},{"key":"e_1_3_2_1_20_1","volume-title":"Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. CoRR abs\/1802.04730","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache, Oleksandr Zinenko, Theodoros Theodoridis, Priya Goyal, Zachary DeVito, William\u00a0S. Moses, Sven Verdoolaege, Andrew Adams, and Albert Cohen. 2018. Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. CoRR abs\/1802.04730 (2018). arXiv:1802.04730http:\/\/arxiv.org\/abs\/1802.04730"},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA. 5998\u20136008. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400713"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2303.16245"},{"key":"e_1_3_2_1_24_1","volume-title":"Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance.","author":"Xing Jiarong","year":"2022","unstructured":"Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, and Yibo Zhu. 2022. Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance. (2022). https:\/\/proceedings.mlsys.org\/paper\/2022\/hash\/38b3eff8baf56627478ec76a704e9b52-Abstract.html"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569674"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575737"},{"key":"e_1_3_2_1_27_1","volume-title":"Apollo: Automatic Partition-based Operator Fusion through Layer by Layer Optimization.","author":"Zhao Jie","year":"2022","unstructured":"Jie Zhao, Xiong Gao, Ruijie Xia, Zhaochuang Zhang, Deshi Chen, Lei Chen, Renwei Zhang, Zhen Geng, Bin Cheng, and Xuefeng Jin. 2022. Apollo: Automatic Partition-based Operator Fusion through Layer by Layer Optimization. (2022). https:\/\/proceedings.mlsys.org\/paper\/2022\/hash\/069059b7ef840f0c74a814ec9237b6ec-Abstract.html"},{"key":"e_1_3_2_1_28_1","unstructured":"Lianmin Zheng. 2020. Optimizing Operators with Auto-scheduling. https:\/\/tvm.apache.org\/docs\/tutorial\/auto_scheduler_matmul_x86.html#sphx-glr-tutorial-auto-scheduler-matmul-x86-py."},{"key":"e_1_3_2_1_29_1","volume-title":"Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody\u00a0Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2020. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4-6, 2020. USENIX Association, 863\u2013879. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/zheng"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_2_1_31_1","unstructured":"Hongyu Zhu. 2022. Get Started Tutorial: Generate a Matmul Kernel. https:\/\/github.com\/microsoft\/nnfusion\/blob\/a87b05c80c834db06aaa890d2ed4e364bc62eba4\/artifacts\/get_started_tutorial\/README_GET_STARTED.md."},{"key":"e_1_3_2_1_32_1","unstructured":"Hongyu Zhu. 2022. Roller RTX 3090 config file. https:\/\/github.com\/microsoft\/nnfusion\/blob\/roller\/artifacts\/roller\/arch\/rtx3090.py."},{"key":"e_1_3_2_1_33_1","volume-title":"ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. 2022. ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022. USENIX Association, 233\u2013248. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zhu"}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","location":"Kyoto Japan","acronym":"ICS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656626","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656626","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656626","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:23:09Z","timestamp":1755876189000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656626"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":33,"alternative-id":["10.1145\/3650200.3656626","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656626","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}