{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T14:34:12Z","timestamp":1774449252886,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T00:00:00Z","timestamp":1674777600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,1,27]]},"DOI":"10.1145\/3575693.3575702","type":"proceedings-article","created":{"date-parts":[[2023,1,30]],"date-time":"2023-01-30T22:56:55Z","timestamp":1675119415000},"page":"370-384","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":27,"title":["Hidet: Task-Mapping Programming Paradigm for Deep Learning Tensor Programs"],"prefix":"10.1145","author":[{"given":"Yaoyao","family":"Ding","sequence":"first","affiliation":[{"name":"University of Toronto, Canada \/ Vector Institute, Canada"}]},{"given":"Cody Hao","family":"Yu","sequence":"additional","affiliation":[{"name":"Amazon Web Services, USA"}]},{"given":"Bojian","family":"Zheng","sequence":"additional","affiliation":[{"name":"University of Toronto, Canada \/ Vector Institute, Canada"}]},{"given":"Yizhi","family":"Liu","sequence":"additional","affiliation":[{"name":"Amazon Web Services, USA"}]},{"given":"Yida","family":"Wang","sequence":"additional","affiliation":[{"name":"Amazon Web Services, USA"}]},{"given":"Gennady","family":"Pekhimenko","sequence":"additional","affiliation":[{"name":"University of Toronto, Canada \/ Vector Institute, Canada"}]}],"member":"320","published-online":{"date-parts":[[2023,1,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Mart\u00edn Abadi Ashish Agarwal Paul Barham Eugene Brevdo Zhifeng Chen Craig Citro Greg S. Corrado Andy Davis Jeffrey Dean Matthieu Devin Sanjay Ghemawat Ian Goodfellow Andrew Harp Geoffrey Irving Michael Isard Yangqing Jia Rafal Jozefowicz Lukasz Kaiser Manjunath Kudlur Josh Levenberg Dandelion Man\u00e9 Rajat Monga Sherry Moore Derek Murray Chris Olah Mike Schuster Jonathon Shlens Benoit Steiner Ilya Sutskever Kunal Talwar Paul Tucker Vincent Vanhoucke Vijay Vasudevan Fernanda Vi\u00e9gas Oriol Vinyals Pete Warden Martin Wattenberg Martin Wicke Yuan Yu and Xiaoqiang Zheng. 2015. TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems. https:\/\/www.tensorflow.org\/ Software available from tensorflow.org \t\t\t\t  Mart\u00edn Abadi Ashish Agarwal Paul Barham Eugene Brevdo Zhifeng Chen Craig Citro Greg S. Corrado Andy Davis Jeffrey Dean Matthieu Devin Sanjay Ghemawat Ian Goodfellow Andrew Harp Geoffrey Irving Michael Isard Yangqing Jia Rafal Jozefowicz Lukasz Kaiser Manjunath Kudlur Josh Levenberg Dandelion Man\u00e9 Rajat Monga Sherry Moore Derek Murray Chris Olah Mike Schuster Jonathon Shlens Benoit Steiner Ilya Sutskever Kunal Talwar Paul Tucker Vincent Vanhoucke Vijay Vasudevan Fernanda Vi\u00e9gas Oriol Vinyals Pete Warden Martin Wattenberg Martin Wicke Yuan Yu and Xiaoqiang Zheng. 2015. TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems. https:\/\/www.tensorflow.org\/ Software available from tensorflow.org"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322967"},{"key":"e_1_3_2_1_3_1","volume-title":"Deep Learning using Rectified Linear Units (ReLU). ArXiv, abs\/1803.08375","author":"Agarap Abien Fred","year":"2018","unstructured":"Abien Fred Agarap . 2018. Deep Learning using Rectified Linear Units (ReLU). ArXiv, abs\/1803.08375 ( 2018 ). Abien Fred Agarap. 2018. Deep Learning using Rectified Linear Units (ReLU). ArXiv, abs\/1803.08375 (2018)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"e_1_3_2_1_5_1","volume-title":"ONNX: Open Neural Network Exchange. https:\/\/github.com\/onnx\/onnx","author":"Bai Junjie","year":"2019","unstructured":"Junjie Bai , Fang Lu , and Ke Zhang . 2019 . ONNX: Open Neural Network Exchange. https:\/\/github.com\/onnx\/onnx Junjie Bai, Fang Lu, and Ke Zhang. 2019. ONNX: Open Neural Network Exchange. https:\/\/github.com\/onnx\/onnx"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063400"},{"key":"e_1_3_2_1_7_1","unstructured":"Louis Bavoil. 2020. Optimizing Compute Shaders for L2 Locality using Thread-Group ID Swizzling. https:\/\/developer.nvidia.com\/blog\/optimizing-compute-shaders-for-l2-locality-using-thread-group-id-swizzling\/ \t\t\t\t  Louis Bavoil. 2020. Optimizing Compute Shaders for L2 Locality using Thread-Group ID Swizzling. https:\/\/developer.nvidia.com\/blog\/optimizing-compute-shaders-for-l2-locality-using-thread-group-id-swizzling\/"},{"key":"e_1_3_2_1_8_1","volume-title":"Tenth international workshop on frontiers in handwriting recognition.","author":"Chellapilla Kumar","year":"2006","unstructured":"Kumar Chellapilla , Sidd Puri , and Patrice Simard . 2006 . High performance convolutional neural networks for document processing . In Tenth international workshop on frontiers in handwriting recognition. Kumar Chellapilla, Sidd Puri, and Patrice Simard. 2006. High performance convolutional neural networks for document processing. In Tenth international workshop on frontiers in handwriting recognition."},{"key":"e_1_3_2_1_9_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In OSDI.","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen , Thierry Moreau , Ziheng Jiang , Lianmin Zheng , Eddie Q. Yan , Haichen Shen , Meghan Cowan , Leyuan Wang , Yuwei Hu , Luis Ceze , Carlos Guestrin , and Arvind Krishnamurthy . 2018 . TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In OSDI. Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Q. Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In OSDI."},{"key":"#cr-split#-e_1_3_2_1_10_1.1","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. https:\/\/doi.org\/10.48550\/ARXIV.1604.06174 10.48550\/ARXIV.1604.06174"},{"key":"#cr-split#-e_1_3_2_1_10_1.2","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. https:\/\/doi.org\/10.48550\/ARXIV.1604.06174"},{"key":"e_1_3_2_1_11_1","unstructured":"Tianqi Chen Lianmin Zheng Eddie Yan Ziheng Jiang Thierry Moreau Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. 2018. Learning to optimize tensor programs. In Advances in Neural Information Processing Systems. 3389\u20133400. \t\t\t\t  Tianqi Chen Lianmin Zheng Eddie Yan Ziheng Jiang Thierry Moreau Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. 2018. Learning to optimize tensor programs. In Advances in Neural Information Processing Systems. 3389\u20133400."},{"key":"e_1_3_2_1_12_1","volume-title":"cuDNN: Efficient Primitives for Deep Learning. ArXiv, abs\/1410.0759","author":"Chetlur Sharan","year":"2014","unstructured":"Sharan Chetlur , Cliff Woolley , Philippe Vandermersch , Jonathan M. Cohen , John Tran , Bryan Catanzaro , and Evan Shelhamer . 2014. cuDNN: Efficient Primitives for Deep Learning. ArXiv, abs\/1410.0759 ( 2014 ). Sharan Chetlur, Cliff Woolley, Philippe Vandermersch, Jonathan M. Cohen, John Tran, Bryan Catanzaro, and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. ArXiv, abs\/1410.0759 (2014)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"e_1_3_2_1_15_1","unstructured":"ONNX Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/ Version: 1.11.1 \t\t\t\t  ONNX Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/ Version: 1.11.1"},{"key":"e_1_3_2_1_16_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR, abs\/1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2018 . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR, abs\/1810.04805 (2018), arXiv:1810.04805. arxiv:1810.04805 Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR, abs\/1810.04805 (2018), arXiv:1810.04805. arxiv:1810.04805"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.7429879"},{"key":"e_1_3_2_1_18_1","first-page":"167","article-title":"Ios: Inter-operator scheduler for cnn acceleration","volume":"3","author":"Ding Yaoyao","year":"2021","unstructured":"Yaoyao Ding , Ligeng Zhu , Zhihao Jia , Gennady Pekhimenko , and Song Han . 2021 . Ios: Inter-operator scheduler for cnn acceleration . Proceedings of Machine Learning and Systems , 3 (2021), 167 \u2013 180 . Yaoyao Ding, Ligeng Zhu, Zhihao Jia, Gennady Pekhimenko, and Song Han. 2021. Ios: Inter-operator scheduler for cnn acceleration. Proceedings of Machine Learning and Systems, 3 (2021), 167\u2013180.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407857"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 721\u2013747","author":"Fegade Pratik","year":"2022","unstructured":"Pratik Fegade , Tianqi Chen , Phillip Gibbons , and Todd Mowry . 2022 . The CoRa Tensor Compiler: Compilation for Ragged Tensors with Minimal Padding . In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 721\u2013747 . https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/d3d9446802a44259755d38e6d163e820-Paper.pdf Pratik Fegade, Tianqi Chen, Phillip Gibbons, and Todd Mowry. 2022. The CoRa Tensor Compiler: Compilation for Ragged Tensors with Minimal Padding. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 721\u2013747. https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/d3d9446802a44259755d38e6d163e820-Paper.pdf"},{"key":"e_1_3_2_1_21_1","volume-title":"Mowry","author":"Fegade Pratik","year":"2020","unstructured":"Pratik Fegade , Tianqi Chen , Phil Gibbons , and Todd C . Mowry . 2020 . Cortex : A Compiler for Recursive Deep Learning Models. ArXiv , abs\/2011.01383 (2020). Pratik Fegade, Tianqi Chen, Phil Gibbons, and Todd C. Mowry. 2020. Cortex: A Compiler for Recursive Deep Learning Models. ArXiv, abs\/2011.01383 (2020)."},{"key":"e_1_3_2_1_22_1","volume-title":"Yong Yu, and Tianqi Chen.","author":"Feng Siyuan","year":"2022","unstructured":"Siyuan Feng , Bohan Hou , Hongyi Jin , Wuwei Lin , Junru Shao , Ruihang Lai , Zihao Ye , Lianmin Zheng , Cody Hao Yu , Yong Yu, and Tianqi Chen. 2022 . TensorIR: An Abstraction for Automatic Tensorized Program Optimization . https:\/\/doi.org\/10.48550\/ARXIV.2207.04296 10.48550\/ARXIV.2207.04296 Siyuan Feng, Bohan Hou, Hongyi Jin, Wuwei Lin, Junru Shao, Ruihang Lai, Zihao Ye, Lianmin Zheng, Cody Hao Yu, Yong Yu, and Tianqi Chen. 2022. TensorIR: An Abstraction for Automatic Tensorized Program Optimization. https:\/\/doi.org\/10.48550\/ARXIV.2207.04296"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453953.3453972"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414632"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_26_1","unstructured":"NVIDIA Inc.. 2022. Basic Linear Algebra on NVIDIA GPUs. https:\/\/developer.nvidia.com\/cublas \t\t\t\t  NVIDIA Inc.. 2022. Basic Linear Algebra on NVIDIA GPUs. https:\/\/developer.nvidia.com\/cublas"},{"key":"e_1_3_2_1_27_1","unstructured":"NVIDIA Inc.. 2022. NVIDIA TensorRT. https:\/\/developer.nvidia.com\/tensorrt \t\t\t\t  NVIDIA Inc.. 2022. NVIDIA TensorRT. https:\/\/developer.nvidia.com\/tensorrt"},{"key":"e_1_3_2_1_28_1","unstructured":"NVIDIA Inc.. 2022. Parallel Thread Execution ISA. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html \t\t\t\t  NVIDIA Inc.. 2022. Parallel Thread Execution ISA. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of Machine Learning and Systems, I. Dhillon, D. Papailiopoulos, and V. Sze (Eds.). 2, 497\u2013511","author":"Jain Paras","year":"2020","unstructured":"Paras Jain , Ajay Jain , Aniruddha Nrusimha , Amir Gholami , Pieter Abbeel , Joseph Gonzalez , Kurt Keutzer , and Ion Stoica . 2020 . Checkmate: Breaking the Memory Wall with Optimal Tensor Rematerialization . In Proceedings of Machine Learning and Systems, I. Dhillon, D. Papailiopoulos, and V. Sze (Eds.). 2, 497\u2013511 . https:\/\/proceedings.mlsys.org\/paper\/2020\/file\/084b6fbb10729ed4da8c3d3f5a3ae7c9-Paper.pdf Paras Jain, Ajay Jain, Aniruddha Nrusimha, Amir Gholami, Pieter Abbeel, Joseph Gonzalez, Kurt Keutzer, and Ion Stoica. 2020. Checkmate: Breaking the Memory Wall with Optimal Tensor Rematerialization. In Proceedings of Machine Learning and Systems, I. Dhillon, D. Papailiopoulos, and V. Sze (Eds.). 2, 497\u2013511. https:\/\/proceedings.mlsys.org\/paper\/2020\/file\/084b6fbb10729ed4da8c3d3f5a3ae7c9-Paper.pdf"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_2_1_32_1","volume-title":"CUTLASS: CUDA TEMPLATE LIBRARY FOR DENSE LINEAR ALGEBRA AT ALL LEVELS AND SCALES.","author":"Kerr Andrew","year":"2018","unstructured":"Andrew Kerr , Duane Merrill , Julien Demouth , John Tran , Naila Farooqui , Markus Tavenrath , Vince Schuster , Eddie Gornish , Jerry Zheng , and Bageshri Sathe . 2018 . CUTLASS: CUDA TEMPLATE LIBRARY FOR DENSE LINEAR ALGEBRA AT ALL LEVELS AND SCALES. Andrew Kerr, Duane Merrill, Julien Demouth, John Tran, Naila Farooqui, Markus Tavenrath, Vince Schuster, Eddie Gornish, Jerry Zheng, and Bageshri Sathe. 2018. CUTLASS: CUDA TEMPLATE LIBRARY FOR DENSE LINEAR ALGEBRA AT ALL LEVELS AND SCALES."},{"key":"e_1_3_2_1_33_1","volume-title":"Dynamic Tensor Rematerialization. In 9th International Conference on Learning Representations, ICLR 2021","author":"Kirisame Marisa","year":"2021","unstructured":"Marisa Kirisame , Steven Lyubomirsky , Altan Haan , Jennifer Brennan , Mike He , Jared Roesch , Tianqi Chen , and Zachary Tatlock . 2021 . Dynamic Tensor Rematerialization. In 9th International Conference on Learning Representations, ICLR 2021 , Virtual Event, Austria , May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=Vfs_2RnOD0H Marisa Kirisame, Steven Lyubomirsky, Altan Haan, Jennifer Brennan, Mike He, Jared Roesch, Tianqi Chen, and Zachary Tatlock. 2021. Dynamic Tensor Rematerialization. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=Vfs_2RnOD0H"},{"key":"e_1_3_2_1_34_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097\u20131105. \t\t\t\t  Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097\u20131105."},{"key":"e_1_3_2_1_35_1","volume-title":"Deep learning. nature, 521, 7553","author":"LeCun Yann","year":"2015","unstructured":"Yann LeCun , Yoshua Bengio , and Geoffrey Hinton . 2015. Deep learning. nature, 521, 7553 ( 2015 ), 436\u2013444. Yann LeCun, Yoshua Bengio, and Geoffrey Hinton. 2015. Deep learning. nature, 521, 7553 (2015), 436\u2013444."},{"key":"e_1_3_2_1_36_1","volume-title":"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL.","author":"Lewis Mike","year":"2020","unstructured":"Mike Lewis , Yinhan Liu , Naman Goyal , Marjan Ghazvininejad , Abdelrahman Mohamed , Omer Levy , Veselin Stoyanov , and Luke Zettlemoyer . 2020 . BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL. Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. 2020. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL."},{"key":"e_1_3_2_1_37_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Liu Yizhi","year":"2019","unstructured":"Yizhi Liu , Yao Wang , Ruofei Yu , Mu Li , Vin Sharma , and Yida Wang . 2019 . Optimizing $CNN$ Model Inference on $CPUs$ . In 2019 USENIX Annual Technical Conference (USENIX ATC 19) . 1025\u20131040. Yizhi Liu, Yao Wang, Ruofei Yu, Mu Li, Vin Sharma, and Yida Wang. 2019. Optimizing $CNN$ Model Inference on $CPUs$. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 1025\u20131040."},{"key":"e_1_3_2_1_38_1","volume-title":"RAMMER: Enabling Holistic Deep Learning Compiler Optimizations with Rtasks","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma , Zhiqiang Xie , Zhi Yang , Jilong Xue , Youshan Miao , Wei Cui , Wenxiang Hu , Fan Yang , Lintao Zhang , and Lidong Zhou . 2020 . RAMMER: Enabling Holistic Deep Learning Compiler Optimizations with Rtasks . USENIX Association , USA. 17. isbn:978-1-939133-19-9 Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. RAMMER: Enabling Holistic Deep Learning Compiler Optimizations with Rtasks. USENIX Association, USA. 17. isbn:978-1-939133-19-9"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/1365490.1365500"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2010.41"},{"key":"e_1_3_2_1_41_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke , Sam Gross , Francisco Massa , Adam Lerer , James Bradbury , Gregory Chanan , Trevor Killeen , Zeming Lin , Natalia Gimelshein , Luca Antiga , Alban Desmaison , Andreas Kopf , Edward Yang , Zachary DeVito , Martin Raison , Alykhan Tejani , Sasank Chilamkurthy , Benoit Steiner , Lu Fang , Junjie Bai , and Soumith Chintala . 2019. PyTorch: An Imperative Style , High-Performance Deep Learning Library . In Advances in Neural Information Processing Systems 32, H. Wallach, H. Larochelle, A. Beygelzimer, F. d' Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.). Curran Associates, Inc., 8024\u20138035. http:\/\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems 32, H. Wallach, H. Larochelle, A. Beygelzimer, F. d' Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.). Curran Associates, Inc., 8024\u20138035. http:\/\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"e_1_3_2_1_42_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog, 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford , Jeffrey Wu , Rewon Child , David Luan , Dario Amodei , and Ilya Sutskever . 2019. Language models are unsupervised multitask learners. OpenAI blog, 1, 8 ( 2019 ), 9. Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. 2019. Language models are unsupervised multitask learners. OpenAI blog, 1, 8 (2019), 9."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_2_1_44_1","volume-title":"XLA : Compiling Machine Learning for Peak Performance.","author":"Sabne Amit","year":"2020","unstructured":"Amit Sabne . 2020 . XLA : Compiling Machine Learning for Peak Performance. Amit Sabne. 2020. XLA : Compiling Machine Learning for Peak Performance."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_46_1","volume-title":"Cody Hao Yu, and Tianqi Chen","author":"Shao Junru","year":"2022","unstructured":"Junru Shao , Xiyou Zhou , Siyuan Feng , Bohan Hou , Ruihang Lai , Hongyi Jin , Wuwei Lin , Masahiro Masuda , Cody Hao Yu, and Tianqi Chen . 2022 . Tensor Program Optimization with Probabilistic Programs. ArXiv , abs\/2205.13603 (2022). Junru Shao, Xiyou Zhou, Siyuan Feng, Bohan Hou, Ruihang Lai, Hongyi Jin, Wuwei Lin, Masahiro Masuda, Cody Hao Yu, and Tianqi Chen. 2022. Tensor Program Optimization with Probabilistic Programs. ArXiv, abs\/2205.13603 (2022)."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.). 3, 208\u2013222","author":"Shen Haichen","year":"2021","unstructured":"Haichen Shen , Jared Roesch , Zhi Chen , Wei Chen , Yong Wu , Mu Li , Vin Sharma , Zachary Tatlock , and Yida Wang . 2021 . Nimble: Efficiently Compiling Dynamic Neural Networks for Model Inference . In Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.). 3, 208\u2013222 . https:\/\/proceedings.mlsys.org\/paper\/2021\/file\/4e732ced3463d06de0ca9a15b6153677-Paper.pdf Haichen Shen, Jared Roesch, Zhi Chen, Wei Chen, Yong Wu, Mu Li, Vin Sharma, Zachary Tatlock, and Yida Wang. 2021. Nimble: Efficiently Compiling Dynamic Neural Networks for Model Inference. In Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.). 3, 208\u2013222. https:\/\/proceedings.mlsys.org\/paper\/2021\/file\/4e732ced3463d06de0ca9a15b6153677-Paper.pdf"},{"key":"e_1_3_2_1_48_1","unstructured":"Ilya Sutskever Oriol Vinyals and Quoc V Le. 2014. Sequence to sequence learning with neural networks. In Advances in neural information processing systems. 3104\u20133112. \t\t\t\t  Ilya Sutskever Oriol Vinyals and Quoc V Le. 2014. Sequence to sequence learning with neural networks. In Advances in neural information processing systems. 3104\u20133112."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523448"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477497"},{"key":"e_1_3_2_1_54_1","volume-title":"Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. ArXiv, abs\/1802.04730","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache , Oleksandr Zinenko , Theodoros Theodoridis , Priya Goyal , Zach DeVito , William S. Moses , Sven Verdoolaege , Andrew Adams , and Albert Cohen . 2018 . Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. ArXiv, abs\/1802.04730 (2018). Nicolas Vasilache, Oleksandr Zinenko, Theodoros Theodoridis, Priya Goyal, Zach DeVito, William S. Moses, Sven Verdoolaege, Andrew Adams, and Albert Cohen. 2018. Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. ArXiv, abs\/1802.04730 (2018)."},{"key":"e_1_3_2_1_55_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , \u0141 ukasz Kaiser, and Illia Polosukhin . 2017 . Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.). 30, Curran Associates, Inc .. https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.). 30, Curran Associates, Inc.. https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_56_1","volume-title":"PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In USENIX Symposium on Operating Systems Design and Implementation.","author":"Wang Haojie","year":"2021","unstructured":"Haojie Wang , Jidong Zhai , Mingyu Gao , Zixuan Ma , Shizhi Tang , Liyan Zheng , Yuanzhi Li , Kaiyuan Rong , Yuanyong Chen , and Zhihao Jia . 2021 . PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In USENIX Symposium on Operating Systems Design and Implementation. Haojie Wang, Jidong Zhai, Mingyu Gao, Zixuan Ma, Shizhi Tang, Liyan Zheng, Yuanzhi Li, Kaiyuan Rong, Yuanyong Chen, and Zhihao Jia. 2021. PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In USENIX Symposium on Operating Systems Design and Implementation."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370330"},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of Machine Learning and Systems. 4.","author":"Xing Jiarong","year":"2022","unstructured":"Jiarong Xing , Leyuan Wang , Shang Zhang , Jack Chen , Ang Chen , and Yibo Zhu . 2022 . Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance . In Proceedings of Machine Learning and Systems. 4. Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, and Yibo Zhu. 2022. Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance. In Proceedings of Machine Learning and Systems. 4."},{"key":"e_1_3_2_1_59_1","unstructured":"Bing Xu Ying Zhang Hao Lu Yang Chen Terry Chen Mike Iovine Mu-Chu Lee and Zhijing Li. 2022. AITemplate. https:\/\/github.com\/facebookincubator\/AITemplate \t\t\t\t  Bing Xu Ying Zhang Hao Lu Yang Chen Terry Chen Mike Iovine Mu-Chu Lee and Zhijing Li. 2022. AITemplate. https:\/\/github.com\/facebookincubator\/AITemplate"},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.). 3, 255\u2013268","author":"Yang Yichen","year":"2021","unstructured":"Yichen Yang , Phitchaya Phothilimthana , Yisu Wang , Max Willsey , Sudip Roy , and Jacques Pienaar . 2021 . Equality Saturation for Tensor Graph Superoptimization . In Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.). 3, 255\u2013268 . https:\/\/proceedings.mlsys.org\/paper\/2021\/file\/65ded5353c5ee48d0b7d48c591b8f430-Paper.pdf Yichen Yang, Phitchaya Phothilimthana, Yisu Wang, Max Willsey, Sudip Roy, and Jacques Pienaar. 2021. Equality Saturation for Tensor Graph Superoptimization. In Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.). 3, 255\u2013268. https:\/\/proceedings.mlsys.org\/paper\/2021\/file\/65ded5353c5ee48d0b7d48c591b8f430-Paper.pdf"},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 1\u201319","author":"Zhao Jie","year":"2022","unstructured":"Jie Zhao , Xiong Gao , Ruijie Xia , Zhaochuang Zhang , Deshi Chen , Lei Chen , Renwei Zhang , Zhen Geng , Bin Cheng , and Xuefeng Jin . 2022 . Apollo: Automatic Partition-based Operator Fusion through Layer by Layer Optimization . In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 1\u201319 . https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/069059b7ef840f0c74a814ec9237b6ec-Paper.pdf Jie Zhao, Xiong Gao, Ruijie Xia, Zhaochuang Zhang, Deshi Chen, Lei Chen, Renwei Zhang, Zhen Geng, Bin Cheng, and Xuefeng Jin. 2022. Apollo: Automatic Partition-based Operator Fusion through Layer by Layer Optimization. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 1\u201319. https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/069059b7ef840f0c74a814ec9237b6ec-Paper.pdf"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454106"},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 848\u2013863","author":"Zheng Bojian","year":"2022","unstructured":"Bojian Zheng , Ziheng Jiang , Cody Hao Yu , Haichen Shen , Joshua Fromm , Yizhi Liu , Yida Wang , Luis Ceze , Tianqi Chen , and Gennady Pekhimenko . 2022 . DietCode: Automatic Optimization for Dynamic Tensor Programs . In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 848\u2013863 . https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/fa7cdfad1a5aaf8370ebeda47a1ff1c3-Paper.pdf Bojian Zheng, Ziheng Jiang, Cody Hao Yu, Haichen Shen, Joshua Fromm, Yizhi Liu, Yida Wang, Luis Ceze, Tianqi Chen, and Gennady Pekhimenko. 2022. DietCode: Automatic Optimization for Dynamic Tensor Programs. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.). 4, 848\u2013863. https:\/\/proceedings.mlsys.org\/paper\/2022\/file\/fa7cdfad1a5aaf8370ebeda47a1ff1c3-Paper.pdf"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00092"},{"key":"e_1_3_2_1_65_1","volume-title":"Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng , Chengfan Jia , Minmin Sun , Zhao Wu , Cody Hao Yu , Ameer Haj-Ali , Yida Wang , Jun Yang , Danyang Zhuo , Koushik Sen , Joseph E. Gonzalez , and Ion Stoica . 2020 . Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20) . 863\u2013879. Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 863\u2013879."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_2_1_69_1","volume-title":"ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu , Ruofan Wu , Yijia Diao , Shanbin Ke , Haoyu Li , Chen Zhang , Jilong Xue , Lingxiao Ma , Yuqing Xia , Wei Cui , Fan Yang , Mao Yang , Lidong Zhou , Asaf Cidon , and Gennady Pekhimenko . 2022 . ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22) . USENIX Association, Carlsbad, CA. 233\u2013248. isbn:978-1-939133-28-1 https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zhu Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. 2022. ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA. 233\u2013248. isbn:978-1-939133-28-1 https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zhu"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458838"},{"key":"e_1_3_2_1_71_1","unstructured":"Barret Zoph and Quoc V Le. 2016. Neural architecture search with reinforcement learning. arXiv preprint arXiv:1611.01578. \t\t\t\t  Barret Zoph and Quoc V Le. 2016. Neural architecture search with reinforcement learning. arXiv preprint arXiv:1611.01578."}],"event":{"name":"ASPLOS '23: 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"Vancouver BC Canada","acronym":"ASPLOS '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages"]},"container-title":["Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3575693.3575702","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3575693.3575702","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T18:43:52Z","timestamp":1750272232000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3575693.3575702"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,27]]},"references-count":72,"alternative-id":["10.1145\/3575693.3575702","10.1145\/3575693"],"URL":"https:\/\/doi.org\/10.1145\/3575693.3575702","relation":{},"subject":[],"published":{"date-parts":[[2023,1,27]]},"assertion":[{"value":"2023-01-30","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}