{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:39:26Z","timestamp":1766219966677,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":27,"publisher":"ACM","funder":[{"DOI":"10.13039\/100006502","name":"Defense Sciences Office, DARPA","doi-asserted-by":"publisher","award":["HR0011-24-9-0517"],"award-info":[{"award-number":["HR0011-24-9-0517"]}],"id":[{"id":"10.13039\/100006502","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","award":["DE-SC0025645"],"award-info":[{"award-number":["DE-SC0025645"]}],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3754598.3754612","type":"proceedings-article","created":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:32Z","timestamp":1766219672000},"page":"531-540","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Architecture-Aware Models of AI Engines for High-Performance Matrix Matrix Multiplication"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3588-5606","authenticated-orcid":false,"given":"Elliott D.","family":"Binder","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0872-9172","authenticated-orcid":false,"given":"Jeffrey","family":"Low","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5179-8249","authenticated-orcid":false,"given":"Tze Meng","family":"Low","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,12,20]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"2025. Xilinx\/llvm-aie. https:\/\/github.com\/Xilinx\/llvm-aie original-date: 2024-04-22T18:55:00Z."},{"key":"e_1_3_3_1_3_2","unstructured":"2025. Xilinx\/mlir-aie. https:\/\/github.com\/Xilinx\/mlir-aie original-date: 2021-08-27T17:36:02Z."},{"key":"e_1_3_3_1_4_2","unstructured":"AMD. 2024. Overview \u2022 Versal Adaptive SoC AIE-ML Architecture Manual (AM020) \u2022 Reader \u2022 AMD Technical Information Portal. https:\/\/docs.amd.com\/r\/en-US\/am020-versal-aie-ml\/Overview"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"E Anderson Z Bai C Bischof J Demmel J Dongarra J DuCroz A Greenbaum S Hammarling A McKenney and D Sorensen. 1990. LAPACK: A Portable Linear Algebra Library for High-Performance Computers. (1990).","DOI":"10.1109\/SUPERC.1990.129995"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS64566.2025.00106"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"L\u00a0Susan Blackford Antoine Petitet Roldan Pozo Karin Remington R\u00a0Clint Whaley James Demmel Jack Dongarra Iain Duff Sven Hammarling Greg Henry et\u00a0al. 2002. An updated set of basic linear algebra subprograms (BLAS). ACM Trans. Math. Software 28 2 (2002) 135\u2013151.","DOI":"10.1145\/567806.567807"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Nick Brown and Gabriel\u00a0Rodr\u00edguez Canal. 2025. Seamless acceleration of Fortran intrinsics via AMD AI engines. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.10254 (2025).","DOI":"10.1145\/3706628.3708854"},{"key":"e_1_3_3_1_9_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.5555\/645455.653765"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"Tze\u00a0Meng Low Francisco\u00a0D. Igual Tyler\u00a0M. Smith and Enrique\u00a0S. Quintana-Orti. 2017. Analytical Modeling Is Enough for High-Performance BLIS. ACM Trans. Math. Software 43 2 (June 2017) 1\u201318. 10.1145\/2925987","DOI":"10.1145\/2925987"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Alejandro Rico Satyaprakash Pareek Javier Cabezas David Clarke Baris Ozgul Francisco Barat Yao Fu Stephan M\u00fcnz Dylan Stuart Patrick Schlangen Pedro Duarte Sneha Date Indrani Paul Jian Weng Sonal Santan Vinod Kathail Ashish Sirasao and Juanjo Noguera. 2024. AMD XDNA NPU in Ryzen AI Processors. IEEE Micro 44 6 (Nov. 2024) 73\u201382. 10.1109\/MM.2024.3423692Conference Name: IEEE Micro.","DOI":"10.1109\/MM.2024.3423692"},{"key":"e_1_3_3_1_13_2","unstructured":"Andr\u00e9 R\u00f6sti and Michael Franz. 2025. Unlocking the AMD Neural Processing Unit for ML Training on the Client Using Bare-Metal-Programming Tools. arxiv:https:\/\/arXiv.org\/abs\/2504.03083\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/2504.03083"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Martin\u00a0D Schatz Robert\u00a0A Van\u00a0de Geijn and Jack Poulson. 2016. Parallel matrix multiplication: A systematic journey. SIAM Journal on Scientific Computing 38 6 (2016) C748\u2013C781.","DOI":"10.1137\/140993478"},{"key":"e_1_3_3_1_15_2","unstructured":"Yilin Shen. 2025. Evaluating the Efficiency of Neural Network Implementations on AMD Versal AI Engines. PhD Thesis."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.110"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICFPT59805.2023.00016"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00117"},{"key":"e_1_3_3_1_19_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","unstructured":"Field G.\u00a0Van Zee. 2020. Implementing High-Performance Complex Matrix Multiplication via the 1m Method. SIAM Journal on Scientific Computing 42 5 (Sept. 2020) C221\u2013C244. 10.1137\/19M1282040","DOI":"10.1137\/19M1282040"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","unstructured":"Field G.\u00a0Van Zee and Robert A. van\u00a0de Geijn. 2015. BLIS: A Framework for Rapidly Instantiating BLAS Functionality. ACM Trans. Math. Software 41 3 (June 2015) 14:1\u201314:33. 10.1145\/2764454","DOI":"10.1145\/2764454"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","unstructured":"Field G.\u00a0Van Zee Devangi\u00a0N. Parikh and Robert A. van\u00a0de Geijn. 2021. Supporting Mixed-domain Mixed-precision Matrix Multiplication within the BLIS Framework. ACM Trans. Math. Software 47 2 (April 2021) 12:1\u201312:26. 10.1145\/3402225","DOI":"10.1145\/3402225"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","unstructured":"Field G.\u00a0Van Zee and Tyler Smith. 2017. Implementing High-performance Complex Matrix Multiplication via the 3m and 4m Methods. ACM Trans. Math. Software 44 1 (July 2017) 7:1\u20137:36. 10.1145\/3086466","DOI":"10.1145\/3086466"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","unstructured":"Field G.\u00a0Van Zee Tyler Smith Francisco\u00a0D. Igual Mikhail Smelyanskiy Xianyi Zhang Michael Kistler Vernon Austel John Gunnels Tze\u00a0Meng Low Bryan Marker Lee Killough and Robert A. van\u00a0de Geijn. 2016. The BLIS Framework: Experiments in Portability. ACM Trans. Math. Software 42 2 (June 2016) 12:1\u201312:19. 10.1145\/2755561","DOI":"10.1145\/2755561"},{"key":"e_1_3_3_1_25_2","series-title":"Proceedings of Machine Learning Research","first-page":"5776","volume-title":"Proceedings of the 35th International Conference on Machine Learning","volume":"80","author":"Zhang Jiyuan","year":"2018","unstructured":"Jiyuan Zhang, Franz Franchetti, and Tze\u00a0Meng Low. 2018. High Performance Zero-Memory Overhead Direct Convolutions. In Proceedings of the 35th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a080), Jennifer Dy and Andreas Krause (Eds.). PMLR, 5776\u20135785. https:\/\/proceedings.mlr.press\/v80\/zhang18d.html"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3543622.3573210"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","unstructured":"Jinming Zhuang Jason Lau Hanchen Ye Zhuoping Yang Shixin Ji Jack Lo Kristof Denolf Stephen Neuendorffer Alex Jones Jingtong Hu Yiyu Shi Deming Chen Jason Cong and Peipei Zhou. 2024. CHARM 2.0: Composing Heterogeneous Accelerators for Deep Learning on Versal ACAP Architecture. ACM Trans. Reconfigurable Technol. Syst. 17 3 (Sept. 2024) 51:1\u201351:31. 10.1145\/3686163","DOI":"10.1145\/3686163"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Jinming Zhuang Shaojie Xiang Hongzheng Chen Niansong Zhang Zhuoping Yang Tony Mao Zhiru Zhang and Peipei Zhou. 2025. ARIES: An Agile MLIR-Based Compilation Flow for Reconfigurable Devices with AI Engines. (2025).","DOI":"10.1145\/3706628.3708870"}],"event":{"name":"ICPP '25: 54th International Conference on Parallel Processing","location":"San Diego CA USA","acronym":"ICPP '25"},"container-title":["Proceedings of the 54th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3754598.3754612","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:43Z","timestamp":1766219683000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3754598.3754612"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":27,"alternative-id":["10.1145\/3754598.3754612","10.1145\/3754598"],"URL":"https:\/\/doi.org\/10.1145\/3754598.3754612","relation":{},"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"2025-12-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}