{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T12:02:34Z","timestamp":1767960154135,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":16,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,1,26]]},"DOI":"10.1145\/3773656.3773660","type":"proceedings-article","created":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T10:22:11Z","timestamp":1767954131000},"page":"81-90","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Optimization of a GEMM Implementation using Intel AMX"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-5363-345X","authenticated-orcid":false,"given":"Yusuke","family":"Endo","sequence":"first","affiliation":[{"name":"Kyushu university, Fukuoka, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4073-5688","authenticated-orcid":false,"given":"Satoshi","family":"Ohshima","sequence":"additional","affiliation":[{"name":"Research Institute for Information Technology, Kyushu University, Fukuoka, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4111-9425","authenticated-orcid":false,"given":"Takeshi","family":"Nanri","sequence":"additional","affiliation":[{"name":"Research Institute for Information Technology, Kyushu University, Fukuoka, Japan"}]}],"member":"320","published-online":{"date-parts":[[2026,1,25]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"Optimizing Batched HGEMM on Small Sizes Using Tensor Cores","author":"Abdelfattah Ahmad","year":"2019","unstructured":"Ahmad Abdelfattah, Stanimire Tomov, and Jack Dongarra. 2019. Optimizing Batched HGEMM on Small Sizes Using Tensor Cores. Technical Report ICL-UTK-1234-2019. Innovative Computing Laboratory, University of Tennessee. https:\/\/icl.utk.edu\/files\/publications\/2019\/icl-utk-1234-2019.pdf Also presented as a poster at the GPU Technology Conference (GTC) 2019, San Jose, CA."},{"key":"e_1_3_3_1_3_2","volume-title":"Intel\u00ae Intrinsics Guide","year":"2025","unstructured":"Intel. 2025. Intel\u00ae Intrinsics Guide. https:\/\/www.intel.com\/content\/www\/us\/en\/docs\/intrinsics-guide\/index.html"},{"key":"e_1_3_3_1_4_2","volume-title":"What Is Intel\u00ae Advanced Matrix Extensions?","year":"2025","unstructured":"Intel. 2025. What Is Intel\u00ae Advanced Matrix Extensions?https:\/\/www.intel.com\/content\/www\/us\/en\/products\/docs\/accelerator-engines\/what-is-intel-amx.html Accessed: 2025-07-02."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_3_1_6_2","unstructured":"Dhiraj Kalamkar Dheevatsa Mudigere Naveen Mellempudi Dipankar Das Kunal Banerjee Sasikanth Avancha Dharma\u00a0Teja Vooturi Nataraj Jammalamadaka Jianyu Huang Hector Yuen Jiyan Yang Jongsoo Park Alexander Heinecke Evangelos Georganas Sudarshan Srinivasan Abhisek Kundu Misha Smelyanskiy Bharat Kaul and Pradeep Dubey. 2019. A Study of BFLOAT16 for Deep Learning Training. arxiv:https:\/\/arXiv.org\/abs\/1905.12322\u00a0[cs.LG]"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-10549-5_35"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","unstructured":"Daichi Mukunoki Katsuhisa Ozaki Takeshi Ogita and Toshiyuki Imamura. 2020. DGEMM Using Tensor Cores and Its Accurate and Reproducible Versions. Scientific Programming 2020 (2020) 8821958:1\u20138821958:14. 10.1155\/2020\/8821958","DOI":"10.1155\/2020\/8821958"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","unstructured":"Hiroyuki Ootomo Katsuhisa Ozaki and Rio Yokota. 2024. DGEMM on Integer Matrix Multiplication Unit. International Journal of High Performance Computing Applications 38 4 (2024) 297\u2013313. 10.1177\/10943420241239588","DOI":"10.1177\/10943420241239588"},{"key":"e_1_3_3_1_11_2","volume-title":"OpenBLAS","author":"Project OpenBLAS","year":"2025","unstructured":"OpenBLAS Project. 2025. OpenBLAS. https:\/\/www.openblas.net\/"},{"key":"e_1_3_3_1_12_2","volume-title":"perf: Linux profiling with performance counters","author":"community perf","year":"2025","unstructured":"perf community. 2025. perf: Linux profiling with performance counters. https:\/\/perfwiki.github.io\/main\/"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Paul Springer and Paolo Bientinesi. 2017. Design of a High-Performance GEMM-like Tensor\u2013Tensor Multiplication. ACM Transactions on Mathematical Software (TOMS) 44 3 (2017) 28:1\u201328:29.","DOI":"10.1145\/3157733"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Hao Tang Kazuhiko Komatsu Masayuki Sato and Hiroaki Kobayashi. 2021. Efficient Mixed-Precision Tall-and-Skinny Matrix-Matrix Multiplication for GPUs. International Journal of Networking and Computing 11 2 (2021) 267\u2013282.","DOI":"10.15803\/ijnc.11.2_267"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624084"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00030"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3369583.3392685"}],"event":{"name":"SCA\/HPCAsia 2026: Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","location":"Osaka Japan","acronym":"SCA\/HPCAsia 2026"},"container-title":["Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region"],"original-title":[],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T10:22:58Z","timestamp":1767954178000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3773656.3773660"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,25]]},"references-count":16,"alternative-id":["10.1145\/3773656.3773660","10.1145\/3773656"],"URL":"https:\/\/doi.org\/10.1145\/3773656.3773660","relation":{},"subject":[],"published":{"date-parts":[[2026,1,25]]},"assertion":[{"value":"2026-01-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}