{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:27:57Z","timestamp":1773318477789,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","funder":[{"name":"National Key Research and Development Program of China","award":["2023YFB3001503"],"award-info":[{"award-number":["2023YFB3001503"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272474, 61972408"],"award-info":[{"award-number":["62272474, 61972408"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759898","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"185-199","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Constraint-Driven Auto-Tuning of GEMM-like Operators for MT-3000 Many-core Processor"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8316-2934","authenticated-orcid":false,"given":"Xinxin","family":"Qi","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3542-4869","authenticated-orcid":false,"given":"Jianbin","family":"Fang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8364-9793","authenticated-orcid":false,"given":"Peng","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6906-4940","authenticated-orcid":false,"given":"Yonggang","family":"Che","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3183-7228","authenticated-orcid":false,"given":"Jie","family":"Ren","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence and Computer Science, Shaanxi Normal University, Xi'an, Shaanxi, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","unstructured":"Jun Bi Qi Guo Xiaqing Li Yongwei Zhao Yuanbo Wen Yuxuan Guo Enshuai Zhou Xing Hu Zidong Du Ling Li Huaping Chen and Tianshi Chen. 2023. Heron: Automatically Constrained High-Performance Library Generation for Deep Learning Accelerators(ASPLOS 2023). Association for Computing Machinery New York NY USA 314\u2013328. 10.1145\/3582016.3582061","DOI":"10.1145\/3582016.3582061"},{"key":"e_1_3_3_2_3_2","unstructured":"Mariusz Bojarski Philip Yeres Anna Choromanska Krzysztof Choromanski Bernhard Firner Lawrence\u00a0D. Jackel and Urs Muller. 2017. Explaining How a Deep Neural Network Trained with End-to-End Learning Steers a Car. CoRR abs\/1704.07911 (2017). arXiv:https:\/\/arXiv.org\/abs\/1704.07911http:\/\/arxiv.org\/abs\/1704.07911"},{"key":"e_1_3_3_2_4_2","unstructured":"Tianqi Chen Thierry Moreau Ziheng Jiang Lianmin Zheng Eddie Yan Meghan Cowan Haichen Shen Leyuan Wang Yuwei Hu Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. Cornell University - arXiv Cornell University - arXiv (Feb 2018)."},{"key":"e_1_3_3_2_5_2","unstructured":"Tianqi Chen Lianmin Zheng Eddie Yan Ziheng Jiang Thierry Moreau Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. 2018. Learning to Optimize Tensor Programs. Neural Information Processing Systems Neural Information Processing Systems (May 2018)."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.20"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","unstructured":"Jianbin Fang Peng Zhang Chun Huang Tao Tang Kai Lu Ruibo Wang and Zheng Wang. 2023. Programming bare-metal accelerators with heterogeneous threading models: a case study of Matrix-3000. Frontiers Inf. Technol. Electron. Eng. 24 4 (2023) 509\u2013520. 10.1631\/FITEE.2200359","DOI":"10.1631\/FITEE.2200359"},{"key":"e_1_3_3_2_8_2","first-page":"1","volume-title":"SC21: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Georganas Evangelos","year":"2021","unstructured":"Evangelos Georganas, Dhiraj Kalamkar, Sasikanth Avancha, Menachem Adelman, Cristina Anderson, Alexander Breuer, Jeremy Bruestle, Narendra Chaudhary, Abhisek Kundu, Denise Kutnick, Frank Laub, Vasimuddin Md, Sanchit Misra, Ramanarayan Mohanty, Hans Pabst, Barukh Ziv, and Alexander Heinecke. 2021. Tensor Processing Primitives: A Programming Abstraction for Efficiency and Portability in Deep Learning Workloads. In SC21: International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201316."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","unstructured":"Kazushige Goto and Robert Van De\u00a0Geijn. 2008. High-Performance Implementation of the Level-3 BLAS. ACM Trans. Math. Softw. 35 1 Article 4 (jul 2008) 14\u00a0pages. 10.1145\/1377603.1377607","DOI":"10.1145\/1377603.1377607"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_12_2","first-page":"1","volume-title":"SC25: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Hemeng Wang","year":"2025","unstructured":"Wang Hemeng, Du Yang, Li Sidu, Tian Xiaowen, Sun Qingxiao, and Liu Weifeng. 2025. KAMI: Communication-Avoiding General Matrix Multiplication within a Single GPU. In SC25: International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201316."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00050"},{"key":"e_1_3_3_2_14_2","unstructured":"Intel. 2024. OneAPI Deep Neural Network Library.https:\/\/github.com\/oneapi-src\/oneDNN."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"A.\u00a0B. Kahn. 1962. Topological sorting of large networks. Commun. ACM 5 11 (Nov. 1962) 558\u2013562. 10.1145\/368996.369025","DOI":"10.1145\/368996.369025"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","unstructured":"Vasilios Kelefouras A. Kritikakou Iosif Mporas and Vasilios Kolonias. 2016. A High-Performance Matrix\u2014Matrix Multiplication Methodology for CPU and GPU Architectures. J. Supercomput. 72 3 (mar 2016) 804\u2013844. 10.1007\/s11227-015-1613-7","DOI":"10.1007\/s11227-015-1613-7"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey\u00a0E. Hinton. 2017. ImageNet classification with deep convolutional neural networks. Commun. ACM 60 6 (may 2017) 84\u201390. 10.1145\/3065386","DOI":"10.1145\/3065386"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey\u00a0E. Hinton. 2017. ImageNet Classification with Deep Convolutional Neural Networks. Commun. ACM (May 2017) 84\u201390. 10.1145\/3065386","DOI":"10.1145\/3065386"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Kai Lu Yaohua Wang Yang Guo Chun Huang Sheng Liu Ruibo Wang Jianbin Fang Tao Tang Zhaoyun Chen Biwei Liu Zhong Liu Yuanwu Lei and Haiyan Sun. 2022. MT-3000: a heterogeneous multi-zone processor for HPC. CCF Transactions on High Performance Computing 4 (2022) 150 \u2013 164. https:\/\/api.semanticscholar.org\/CorpusID:249054941","DOI":"10.1007\/s42514-022-00095-y"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","unstructured":"Sheng Ma Zhong Liu Shenggang Chen Libo Huang Yang Guo Zhiying Wang and Meidi Zhang. 2019. Coordinated DMA: Improving the DRAM Access Efficiency for Matrix Multiplication. IEEE Trans. Parallel Distributed Syst. 30 10 (2019) 2148\u20132164. 10.1109\/TPDS.2019.2906891","DOI":"10.1109\/TPDS.2019.2906891"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.3389\/fams.2022.1038885"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063392"},{"key":"e_1_3_3_2_23_2","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun\u00a0Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson\u00a0G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. CoRR abs\/1906.00091 (2019). arXiv:https:\/\/arXiv.org\/abs\/1906.00091http:\/\/arxiv.org\/abs\/1906.00091"},{"key":"e_1_3_3_2_24_2","unstructured":"Nvidia. 2024. CuBLAS. https:\/\/developer.nvidia.com\/cublas."},{"key":"e_1_3_3_2_25_2","unstructured":"Nvidia. 2024. CuDNN. https:\/\/developer.nvidia.com\/cudnn."},{"key":"e_1_3_3_2_26_2","unstructured":"Nvidia. 2024. CUTLASS. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","unstructured":"LUIS SENTIS and OUSSAMA KHATIB. 2005. SYNTHESIS OF WHOLE-BODY BEHAVIORS THROUGH HIERARCHICAL CONTROL OF BEHAVIORAL PRIMITIVES. International Journal of Humanoid Robotics 02 04 (2005) 505\u2013518. arXiv:10.1142\/S021984360500059410.1142\/S0219843605000594","DOI":"10.1142\/S0219843605000594"},{"key":"e_1_3_3_2_29_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. International Conference on Learning Representations International Conference on Learning Representations (Jan 2015)."},{"key":"e_1_3_3_2_30_2","unstructured":"Ilya Sutskever Oriol Vinyals and Quoc\u00a0V. Le. 2014. Sequence to sequence learning with neural networks(NIPS\u201914). MIT Press Cambridge MA USA 3104\u20133112."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","unstructured":"Sanket Tavarageri Alexander Heinecke Sasikanth Avancha Bharat Kaul Gagandeep Goyal and Ramakrishna Upadrasta. 2021. PolyDL: Polyhedral Optimizations for Creation of High-performance DL Primitives. ACM Transactions on Architecture and Code Optimization (Mar 2021) 1\u201327. 10.1145\/3433103","DOI":"10.1145\/3433103"},{"key":"e_1_3_3_2_32_2","unstructured":"A\u00e4ron van\u00a0den Oord Sander Dieleman Heiga Zen Karen Simonyan Oriol Vinyals Alex Graves Nal Kalchbrenner Andrew\u00a0W. Senior and Koray Kavukcuoglu. 2016. WaveNet: A Generative Model for Raw Audio. CoRR abs\/1609.03499 (2016). arXiv:https:\/\/arXiv.org\/abs\/1609.03499http:\/\/arxiv.org\/abs\/1609.03499"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2008.5214359"},{"key":"e_1_3_3_2_35_2","first-page":"204","volume-title":"Proceedings of Machine Learning and Systems","volume":"4","author":"Xing Jiarong","year":"2022","unstructured":"Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, and Yibo Zhu. 2022. Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance. In Proceedings of Machine Learning and Systems , D.\u00a0Marculescu, Y.\u00a0Chi, and C.\u00a0Wu (Eds.), Vol.\u00a04. 204\u2013216. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2022\/file\/1f8053a67ec8e0b57455713cefdd8218-Paper.pdf"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640390"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00090"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454106"},{"key":"e_1_3_3_2_39_2","series-title":"(OSDI\u201920)","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody\u00a0Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2020. Ansor: generating high-performance tensor programs for deep learning. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation(OSDI\u201920). USENIX Association, USA, Article 49, 17\u00a0pages."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759898","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:43:01Z","timestamp":1773254581000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759898"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":39,"alternative-id":["10.1145\/3712285.3759898","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759898","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}