{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:34:30Z","timestamp":1772724870365,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1145\/3696443.3708967","type":"proceedings-article","created":{"date-parts":[[2025,2,22]],"date-time":"2025-02-22T11:50:26Z","timestamp":1740225026000},"page":"107-122","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["IntelliGen: Instruction-Level Auto-tuning for Tensor Program with Monotonic Memory Optimization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9630-7645","authenticated-orcid":false,"given":"Zixuan","family":"Ma","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"},{"name":"Qingcheng.AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4605-148X","authenticated-orcid":false,"given":"Haojie","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4727-0150","authenticated-orcid":false,"given":"Jingze","family":"Xing","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9442-7375","authenticated-orcid":false,"given":"Shuhong","family":"Huang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7327-748X","authenticated-orcid":false,"given":"Liyan","family":"Zheng","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9045-9269","authenticated-orcid":false,"given":"Chen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3092-1578","authenticated-orcid":false,"given":"Huanqi","family":"Cao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7273-0952","authenticated-orcid":false,"given":"Kezhao","family":"Huang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7573-2250","authenticated-orcid":false,"given":"Mingshu","family":"Zhai","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6543-0859","authenticated-orcid":false,"given":"Shizhi","family":"Tang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9308-6273","authenticated-orcid":false,"given":"Penghan","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7656-6428","authenticated-orcid":false,"given":"Jidong","family":"Zhai","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,3]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"265","volume-title":"TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jefrey Dean, Matthieu Devin, Sanjay Ghemawat, Geofrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). USENIX Association, Savannah, GA, 265-283. https:\/\/www. usenix.org\/conference\/osdi16\/technical-sessions\/presentation\/abadi"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"e_1_3_2_1_4_1","first-page":"578","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 578-594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen"},{"key":"e_1_3_2_1_5_1","unstructured":"Sharan Chetlur Clif Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Eficient Primitives for Deep Learning. arXiv:1410.0759 [cs.NE] https:\/\/arxiv.org\/abs\/1410.0759"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_8_1","volume-title":"XLA: Optimizing Compiler for TensorFlow. https:\/\/www.tensorflow.org\/xla.","year":"2017","unstructured":"Google. 2017. XLA: Optimizing Compiler for TensorFlow. https:\/\/www.tensorflow.org\/xla."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582018"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSSC"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523446"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_13_1","volume-title":"Involution: Inverting the Inherence of Convolution for Visual Recognition. arXiv: 2103.06255 [cs.CV] https:\/\/arxiv.org\/abs\/2103.06255","author":"Li Duo","year":"2021","unstructured":"Duo Li, Jie Hu, Changhu Wang, Xiangtai Li, Qi She, Lei Zhu, Tong Zhang, and Qifeng Chen. 2021. Involution: Inverting the Inherence of Convolution for Visual Recognition. arXiv: 2103.06255 [cs.CV] https:\/\/arxiv.org\/abs\/2103.06255"},{"key":"e_1_3_2_1_14_1","unstructured":"NVIDIA. 2017. NVIDIA TensorRT: Programmable Inference Accelerator. https:\/\/developer.nvidia.com\/tensorrt."},{"key":"e_1_3_2_1_15_1","first-page":"606","volume-title":"Proceedings of Machine Learning and Systems, D. Song, M. Carbin, and T. Chen (Eds.)","volume":"5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jef Dean. 2023. Eficiently Scaling Transformer Inference. In Proceedings of Machine Learning and Systems, D. Song, M. Carbin, and T. Chen (Eds.), Vol. 5. Curan, 606-624. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2023\/file\/ c4be71ab8d24cdfb45e3d06dbfca2780-Paper-mlsys2023.pdf"},{"key":"e_1_3_2_1_16_1","unstructured":"Alec Radford Jefrey Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. OpenAI blog."},{"key":"e_1_3_2_1_17_1","first-page":"701","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Shi Yining","year":"2023","unstructured":"Yining Shi, Zhi Yang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Ziming Miao, Yuxiao Guo, Fan Yang, and Lidong Zhou. 2023. Welder: Scheduling Deep Learning Memory Access via Tile-graph. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 701-718. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/shi"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523448"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2400682"},{"key":"e_1_3_2_1_21_1","first-page":"37","volume-title":"PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Wang Haojie","year":"2021","unstructured":"Haojie Wang, Jidong Zhai, Mingyu Gao, Zixuan Ma, Shizhi Tang, Liyan Zheng, Yuanzhi Li, Kaiyuan Rong, Yuanyong Chen, and Zhihao Jia. 2021. PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). USENIX Association, 37-54. https:\/\/www.usenix.org\/conference\/ osdi21\/presentation\/wang"},{"key":"e_1_3_2_1_22_1","unstructured":"Qiang Zhang Qiangqiang Yuan Jie Li Zhen Yang and Xiaoshuang Ma. 2018. Learning a Dilated Residual Network for SAR Image Despeckling. arXiv: 1709.02898 [cs.CV] https:\/\/arxiv.org\/abs\/1709.02898"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR"},{"key":"e_1_3_2_1_24_1","first-page":"863","volume-title":"Ansor: Generating HighPerformance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020. Ansor: Generating HighPerformance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 863-879. https:\/\/www.usenix.org\/conference\/ osdi20\/presentation\/zheng"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"}],"event":{"name":"CGO '25: 23rd ACM\/IEEE International Symposium on Code Generation and Optimization","location":"Las Vegas NV USA","acronym":"CGO '25","sponsor":["SIGPLAN SIGPLAN Programming Languages","SIGMICRO SIGMICRO Microarchitecture","IEEE Computer Society IEEE Computer Society"]},"container-title":["Proceedings of the 23rd ACM\/IEEE International Symposium on Code Generation and Optimization"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696443.3708967","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:13Z","timestamp":1750295413000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696443.3708967"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3]]},"references-count":25,"alternative-id":["10.1145\/3696443.3708967","10.1145\/3696443"],"URL":"https:\/\/doi.org\/10.1145\/3696443.3708967","relation":{},"subject":[],"published":{"date-parts":[[2025,3]]},"assertion":[{"value":"2025-03-01","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}