{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T11:27:08Z","timestamp":1777030028913,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3676641.3716269","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:47:32Z","timestamp":1743094052000},"page":"949-965","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Pruner: A Draft-then-Verify Exploration Mechanism to Accelerate Tensor Program Tuning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3366-9881","authenticated-orcid":false,"given":"Liang","family":"Qiao","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9888-6238","authenticated-orcid":false,"given":"Jun","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4115-9475","authenticated-orcid":false,"given":"Xiaoyu","family":"Hao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4786-2551","authenticated-orcid":false,"given":"Xi","family":"Fang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6494-8502","authenticated-orcid":false,"given":"Sen","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4524-4830","authenticated-orcid":false,"given":"Minfan","family":"Zhao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3236-283X","authenticated-orcid":false,"given":"Ziqi","family":"Zhu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6487-3658","authenticated-orcid":false,"given":"Junshi","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3900-3722","authenticated-orcid":false,"given":"Hong","family":"An","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3385-2053","authenticated-orcid":false,"given":"Xulong","family":"Tang","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1304-7181","authenticated-orcid":false,"given":"Bing","family":"Li","sequence":"additional","affiliation":[{"name":"NIO, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8429-2758","authenticated-orcid":false,"given":"Honghui","family":"Yuan","sequence":"additional","affiliation":[{"name":"NIO, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1678-9028","authenticated-orcid":false,"given":"Xinyang","family":"Wang","sequence":"additional","affiliation":[{"name":"NIO, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et al. 2016. {TensorFlow}: a system for {Large-Scale} machine learning. In 12th USENIX symposium on operating systems design and implementation (OSDI 16). 265--283."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322967"},{"key":"e_1_3_2_1_3_1","volume-title":"Learning to schedule halide pipelines for the gpu. arXiv e-prints","author":"Anderson Luke","year":"2020","unstructured":"Luke Anderson, Andrew Adams, Karima Ma, Tzu-Mao Li, and Jonathan Ragan-Kelley. 2020. Learning to schedule halide pipelines for the gpu. arXiv e-prints (2020), arXiv--2012."},{"key":"e_1_3_2_1_4_1","first-page":"181","article-title":"A deep learning based cost model for automatic code optimization","volume":"3","author":"Baghdadi Riyadh","year":"2021","unstructured":"Riyadh Baghdadi, Massinissa Merouani, Mohamed-Hicham Leghettas, Kamel Abdous, Taha Arbaoui, Karima Benatchba, et al. 2021. A deep learning based cost model for automatic code optimization. Proceedings of Machine Learning and Systems, Vol. 3 (2021), 181--193.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.5555\/3314872.3314896"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582061"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/1273496.1273513"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_9_1","volume-title":"Rethinking atrous convolution for semantic image segmentation. arXiv preprint arXiv:1706.05587","author":"Chen Liang-Chieh","year":"2017","unstructured":"Liang-Chieh Chen, George Papandreou, Florian Schroff, and Hartwig Adam. 2017. Rethinking atrous convolution for semantic image segmentation. arXiv preprint arXiv:1706.05587 (2017)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"e_1_3_2_1_11_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018a. {TVM}: An automated {End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578--594."},{"key":"e_1_3_2_1_12_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Lianmin Zheng, Eddie Yan, Ziheng Jiang, Thierry Moreau, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018b. Learning to optimize tensor programs. Advances in Neural Information Processing Systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_13_1","volume-title":"Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297","author":"Chen Xinlei","year":"2020","unstructured":"Xinlei Chen, Haoqi Fan, Ross Girshick, and Kaiming He. 2020. Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"e_1_3_2_1_15_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_16_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_20_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al.","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_1_21_1","first-page":"387","article-title":"A learned performance model for tensor processing units","volume":"3","author":"Kaufman Sam","year":"2021","unstructured":"Sam Kaufman, Phitchaya Phothilimthana, Yanqi Zhou, Charith Mendis, Sudip Roy, Amit Sabne, and Mike Burrows. 2021. A learned performance model for tensor processing units. Proceedings of Machine Learning and Systems, Vol. 3 (2021), 387--400.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.435"},{"key":"e_1_3_2_1_24_1","first-page":"14807","article-title":"Adatune: Adaptive tensor program compilation made efficient","volume":"33","author":"Li Menghao","year":"2020","unstructured":"Menghao Li, Minjia Zhang, Chi Wang, and Mingqin Li. 2020. Adatune: Adaptive tensor program compilation made efficient. Advances in Neural Information Processing Systems, Vol. 33 (2020), 14807--14819.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446759"},{"key":"e_1_3_2_1_26_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. Rammer: Enabling holistic deep learning compiler optimizations with {rTasks}. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 881--897."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925952"},{"key":"e_1_3_2_1_28_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_29_1","volume-title":"Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434","author":"Radford Alec","year":"2015","unstructured":"Alec Radford. 2015. Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434 (2015)."},{"key":"e_1_3_2_1_30_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_33_1","first-page":"35783","article-title":"Tensor program optimization with probabilistic programs","volume":"35","author":"Shao Junru","year":"2022","unstructured":"Junru Shao, Xiyou Zhou, Siyuan Feng, Bohan Hou, Ruihang Lai, Hongyi Jin, Wuwei Lin, Masahiro Masuda, Cody Hao Yu, and Tianqi Chen. 2022. Tensor program optimization with probabilistic programs. Advances in Neural Information Processing Systems, Vol. 35 (2022), 35783--35796.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_34_1","first-page":"323","article-title":"Value learning for throughput optimization of deep learning workloads","volume":"3","author":"Steiner Benoit","year":"2021","unstructured":"Benoit Steiner, Chris Cummins, Horace He, and Hugh Leather. 2021. Value learning for throughput optimization of deep learning workloads. Proceedings of Machine Learning and Systems, Vol. 3 (2021), 323--334.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_36_1","unstructured":"TensorFlow. [n. d.]. XLA: Optimizing Compiler for Machine Learning. https:\/\/www.tensorflow.org\/xla."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_38_1","unstructured":"Torch-TensorRT. [n. d.]. https:\/\/pytorch.org\/TensorRT\/."},{"key":"e_1_3_2_1_39_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3269206.3271784"},{"key":"e_1_3_2_1_42_1","first-page":"204","article-title":"Bolt: Bridging the gap between auto-tuners and hardware-native performance","volume":"4","author":"Xing Jiarong","year":"2022","unstructured":"Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, and Yibo Zhu. 2022. Bolt: Bridging the gap between auto-tuners and hardware-native performance. Proceedings of Machine Learning and Systems, Vol. 4 (2022), 204--216.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_43_1","volume-title":"Wide residual networks. arXiv preprint arXiv:1605.07146","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2016. Wide residual networks. arXiv preprint arXiv:1605.07146 (2016)."},{"key":"e_1_3_2_1_44_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhai Yi","year":"2024","unstructured":"Yi Zhai, Sijia Yang, Keyu Pan, Renwei Zhang, Shuo Liu, Chao Liu, Zichun Ye, Jianmin Ji, Jie Zhao, Yu Zhang, et al. 2024. Enabling Tensor Language Model to Assist in Generating {High-Performance} Tensor Programs for Deep Learning. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 289--305."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575737"},{"key":"e_1_3_2_1_46_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454106"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651348"},{"key":"e_1_3_2_1_49_1","volume-title":"Moses: Efficient exploitation of cross-device transferable features for tensor program optimization. arXiv preprint arXiv:2201.05752","author":"Zhao Zhihe","year":"2022","unstructured":"Zhihe Zhao, Xian Shuai, Yang Bai, Neiwen Ling, Nan Guan, Zhenyu Yan, and Guoliang Xing. 2022. Moses: Efficient exploitation of cross-device transferable features for tensor program optimization. arXiv preprint arXiv:2201.05752 (2022)."},{"key":"e_1_3_2_1_50_1","volume-title":"14th USENIX symposium on operating systems design and implementation (OSDI 20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, et al. 2020a. Ansor: Generating {High-Performance} tensor programs for deep learning. In 14th USENIX symposium on operating systems design and implementation (OSDI 20). 863--879."},{"key":"e_1_3_2_1_51_1","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1).","author":"Zheng Lianmin","year":"2021","unstructured":"Lianmin Zheng, Ruochen Liu, Junru Shao, Tianqi Chen, Joseph E Gonzalez, Ion Stoica, and Ameer Haj Ali. 2021. Tenset: A large-scale program performance dataset for learned tensor compilers. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"e_1_3_2_1_53_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, et al. 2022. {ROLLER}: Fast and efficient tensor compilation for deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 233--248."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716269","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676641.3716269","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:11:12Z","timestamp":1755774672000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716269"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":53,"alternative-id":["10.1145\/3676641.3716269","10.1145\/3676641"],"URL":"https:\/\/doi.org\/10.1145\/3676641.3716269","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}