{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,3]],"date-time":"2026-07-03T02:47:04Z","timestamp":1783046824902,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001691","name":"Japan Society for the Promotion of Science","doi-asserted-by":"publisher","award":["23K28100, 24K23874, 25H01109, 25K03126"],"award-info":[{"award-number":["23K28100, 24K23874, 25H01109, 25K03126"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767539","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:18:44Z","timestamp":1762532324000},"page":"1824-1831","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["High-Performance and Power-Efficient Emulation of Matrix Multiplication using INT8 Matrix Engines"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5906-6624","authenticated-orcid":false,"given":"Yuki","family":"Uchino","sequence":"first","affiliation":[{"name":"RIKEN Center for Computational Science (R-CCS), Kobe, Hyogo, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0431-6232","authenticated-orcid":false,"given":"Katsuhisa","family":"Ozaki","sequence":"additional","affiliation":[{"name":"Shibaura Institute of Technology, Saitama, Saitama, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1601-9710","authenticated-orcid":false,"given":"Toshiyuki","family":"Imamura","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science (R-CCS), Kobe, Hyogo, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","volume-title":"AMD matrix cores","author":"Inc. AMD","year":"2025","unstructured":"AMD Inc.2025. AMD matrix cores. https:\/\/rocm.blogs.amd.com\/software-tools-optimization\/matrix-cores\/README.html retrieved 21 March, 2025."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","unstructured":"William Dawson Katsuhisa Ozaki Jens Domke and Takahito Nakajima. 2024. Reducing Numerical Precision Requirements in Quantum Chemistry Calculations. Journal of Chemical Theory and Computation 20 24 (2024) 10826\u201310837. 10.1021\/acs.jctc.4c00938","DOI":"10.1021\/acs.jctc.4c00938"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","unstructured":"Jack\u00a0J Dongarra Piotr Luszczek and Antoine Petitet. 2003. The LINPACK benchmark: past present and future. Concurrency and Computation: practice and experience 15 9 (2003) 803\u2013820. 10.1002\/cpe.728","DOI":"10.1002\/cpe.728"},{"key":"e_1_3_3_2_5_2","volume-title":"Cloud Tensor Processing Unit","author":"LLC Google","year":"2025","unstructured":"Google LLC. 2025. Cloud Tensor Processing Unit. https:\/\/cloud.google.com\/tpu\/docs\/intro-to-tpu retrieved 21 March, 2025."},{"key":"e_1_3_3_2_6_2","unstructured":"Babak Hejazi Mark Wolf Roman Dubtsov and Becca Zandstein. 2025. Boosting Matrix Multiplication Speed and Flexibility with NVIDIA cuBLAS 12.9. https:\/\/developer.nvidia.com\/blog\/boosting-matrix-multiplication-speed-and-flexibility-with-nvidia-cublas-12-9\/"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Greg Henry Ping Tak\u00a0Peter Tang and Alexander Heinecke. 2019. Leveraging the bfloat16 Artificial Intelligence Datatype For Higher-Precision Computations. arxiv:https:\/\/arXiv.org\/abs\/1904.06376\u00a0[cs.MS]","DOI":"10.1109\/ARITH.2019.00019"},{"key":"e_1_3_3_2_8_2","volume-title":"NVIDIA Tensor Cores","author":"Corporation NVIDIA","year":"2025","unstructured":"NVIDIA Corporation. 2025. NVIDIA Tensor Cores. https:\/\/www.nvidia.com\/en-us\/data-center\/tensor-cores\/ retrieved 25 July, 2025."},{"key":"e_1_3_3_2_9_2","volume-title":"cuMpSGEMM - CUDA Mutable-precision SGEMM","author":"Ootomo Hiroyuki","year":"2023","unstructured":"Hiroyuki Ootomo. 2023. cuMpSGEMM - CUDA Mutable-precision SGEMM. https:\/\/github.com\/enp1s0\/cuMpSGEMM"},{"key":"e_1_3_3_2_10_2","volume-title":"ozIMMU - DGEMM on Int8 Tensor Core","author":"Ootomo Hiroyuki","year":"2024","unstructured":"Hiroyuki Ootomo. 2024. ozIMMU - DGEMM on Int8 Tensor Core. https:\/\/github.com\/enp1s0\/ozIMMU"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-32041-514"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","unstructured":"Hiroyuki Ootomo Katsuhisa Ozaki and Rio Yokota. 2024. DGEMM on integer matrix multiplication unit. The International Journal of High Performance Computing Applications 38 4 (2024) 297\u2013313. 10.1177\/10943420241239588","DOI":"10.1177\/10943420241239588"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","unstructured":"Hiroyuki Ootomo and Rio Yokota. 2022. Recovering single precision accuracy from Tensor Cores while surpassing the FP32 theoretical peak performance. The International Journal of High Performance Computing Applications 36 4 (2022) 475\u2013491. 10.1177\/10943420221090256","DOI":"10.1177\/10943420221090256"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","unstructured":"Katsuhisa Ozaki Takeshi Ogita Shin\u2019ichi Oishi and Siegfried\u00a0M. Rump. 2012. Error-free transformations of matrix multiplication by using fast routines of matrix multiplication and its applications. Numerical Algorithms 59 1 (2012) 95\u2013118. 10.1007\/s11075-011-9478-1","DOI":"10.1007\/s11075-011-9478-1"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"Katsuhisa Ozaki Takeshi Ogita Shin\u2019ichi Oishi and Siegfried\u00a0M Rump. 2013. Generalization of error-free transformation for matrix multiplication and its application. Nonlinear Theory and Its Applications IEICE 4 1 (2013) 2\u201311. 10.1587\/nolta.4.2","DOI":"10.1587\/nolta.4.2"},{"key":"e_1_3_3_2_16_2","unstructured":"Katsuhisa Ozaki Yuki Uchino and Toshiyuki Imamura. 2025. Ozaki Scheme II: A GEMM-oriented emulation of floating-point matrix multiplication using an integer modular technique. arxiv:https:\/\/arXiv.org\/abs\/2504.08009\u00a0[cs.MS]"},{"key":"e_1_3_3_2_17_2","volume-title":"Heterogeneous AI Powerhouse: Unveiling the Hardware and Software Foundation of Intel\u00ae Core\u2122 Ultra Processors for the Edge, White Paper","author":"Perumal Ramesh","year":"2024","unstructured":"Ramesh Perumal, Nikitha Chinthalapani, and Rohit D\u2019Souza. 2024. Heterogeneous AI Powerhouse: Unveiling the Hardware and Software Foundation of Intel\u00ae Core\u2122 Ultra Processors for the Edge, White Paper. https:\/\/www.intel.com\/content\/www\/us\/en\/content-details\/817736"},{"key":"e_1_3_3_2_18_2","volume-title":"Accelerator for ozIMMU","author":"Uchino Yuki","year":"2024","unstructured":"Yuki Uchino. 2024. Accelerator for ozIMMU. https:\/\/github.com\/RIKEN-RCCS\/accelerator_for_ozIMMU"},{"key":"e_1_3_3_2_19_2","volume-title":"GEMMul8: GEMM emulation using int8 matrix engines based on the Ozaki scheme II","author":"Uchino Yuki","year":"2025","unstructured":"Yuki Uchino. 2025. GEMMul8: GEMM emulation using int8 matrix engines based on the Ozaki scheme II. https:\/\/github.com\/RIKEN-RCCS\/GEMMul8"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","unstructured":"Yuki Uchino Katsuhisa Ozaki and Toshiyuki Imamura. 2025. Performance enhancement of the Ozaki Scheme on integer matrix multiplication unit. The International Journal of High Performance Computing Applications 39 3 (2025) 462\u2013476. 10.1177\/10943420241313064","DOI":"10.1177\/10943420241313064"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767539","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:32:41Z","timestamp":1767987161000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767539"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":19,"alternative-id":["10.1145\/3731599.3767539","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767539","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}