{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,20]],"date-time":"2026-04-20T21:52:48Z","timestamp":1776721968991,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","funder":[{"name":"JSPS KAKENHI Grant Number","award":["JP25H01109"],"award-info":[{"award-number":["JP25H01109"]}]},{"name":"JSPS KAKENHI Grant Number","award":["JP25K00141"],"award-info":[{"award-number":["JP25K00141"]}]},{"name":"JSPS KAKENHI Grant Number","award":["JP24K22299"],"award-info":[{"award-number":["JP24K22299"]}]},{"name":"JSPS KAKENHI Grant Number","award":["JP24K02949"],"award-info":[{"award-number":["JP24K02949"]}]},{"name":"JSPS KAKENHI Grant Number","award":["JP23H00462"],"award-info":[{"award-number":["JP23H00462"]}]},{"name":"Joint Usage\/Research Center for Interdisciplinary Large-scale Information Infrastructures","award":["jh250083"],"award-info":[{"award-number":["jh250083"]}]},{"name":"Joint Usage\/Research Center for Interdisciplinary Large-scale Information Infrastructures","award":["jh250019"],"award-info":[{"award-number":["jh250019"]}]},{"name":"Joint Usage\/Research Center for Interdisciplinary Large-scale Information Infrastructures","award":["jh250032"],"award-info":[{"award-number":["jh250032"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,1,26]]},"DOI":"10.1145\/3773656.3773678","type":"proceedings-article","created":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T10:22:11Z","timestamp":1767954131000},"page":"153-164","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Tensor-Core-Optimized Strategies for BLR \u00d7 Tall-Skinny Matrix Multiplication in BEM"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7751-1093","authenticated-orcid":false,"given":"Akihiro","family":"Ida","sequence":"first","affiliation":[{"name":"Japan Agency for Marine-Earth Science and Technology, Yokohama, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4146-8134","authenticated-orcid":false,"given":"Kazuya","family":"Goto","sequence":"additional","affiliation":[{"name":"PExProCS, LLC, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7573-7873","authenticated-orcid":false,"given":"Rio","family":"Yokota","sequence":"additional","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4285-893X","authenticated-orcid":false,"given":"Tasuku","family":"Hiraishi","sequence":"additional","affiliation":[{"name":"Kyoto Tachibana University, Kyoto, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2970-6037","authenticated-orcid":false,"given":"Toshihiro","family":"Hanawa","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Kashiwa, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1938-1723","authenticated-orcid":false,"given":"Takeshi","family":"Iwashita","sequence":"additional","affiliation":[{"name":"Kyoto University, Kyoto, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1454-202X","authenticated-orcid":false,"given":"Masatoshi","family":"Kawai","sequence":"additional","affiliation":[{"name":"Tohoku University, Sendai, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4073-5688","authenticated-orcid":false,"given":"Satoshi","family":"Ohshima","sequence":"additional","affiliation":[{"name":"Kyushu University, Fukuoka, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5349-6852","authenticated-orcid":false,"given":"Tetsuya","family":"Hoshino","sequence":"additional","affiliation":[{"name":"Nagoya University, Nagoya, Japan"}]}],"member":"320","published-online":{"date-parts":[[2026,1,25]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Emmanuel Agullo Luc Giraud and Yan-Fei Jing. 2014. Block GMRES method with inexact breakdowns and deflated restarting. SIAM J. Matrix Anal. Appl. 35 4 (2014) 1625\u20131651.","DOI":"10.1137\/140961912"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-58667-0_2"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Sivaram Ambikasaran and Eric Darve. 2013. An O(N log N) fast direct solver for partial hierarchically semi-separable matrices. Journal of Scientific Computing 57 3 (2013) 477\u2013501.","DOI":"10.1007\/s10915-013-9714-z"},{"key":"e_1_3_3_1_5_2","unstructured":"AMD. 2025. AMD Matrix Cores. Retrieved September 13 2025 from https:\/\/fs.hlrs.de\/projects\/par\/events\/2024\/GPU-AMD\/day4\/20.% 20AMD_Matrix_Cores.pdf."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Patrick Amestoy Cleve Ashcraft Olivier Boiteau Alfredo Buttari Jean-Yves L\u2019Excellent and Cl\u00e9ment Weisbecker. 2015. Improving multifrontal methods by means of block low-rank representations. SIAM Journal on Scientific Computing 37 3 (2015) A1451\u2013A1474.","DOI":"10.1137\/120903476"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Mario Bebendorf. 2000. Approximation of boundary element matrices. Numer. Math. 86 4 (2000) 565\u2013589.","DOI":"10.1007\/PL00005410"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Shankar Chandrasekaran Patrick Dewilde Ming Gu William Lyons and Tim Pals. 2006. A fast solver for HSS representations via sparse matrices. SIAM J. Matrix Anal. Appl. 29 1 (2006) 67\u201381.","DOI":"10.1137\/050639028"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-96983-1_57"},{"key":"e_1_3_3_1_10_2","unstructured":"Intel Corporation. 2025. Optimizing Machine Learning Models with Intel\u00ae Advanced Matrix Extensions (Intel\u00ae AMX). Retrieved September 13 2025 from https:\/\/www.intel.com\/content\/dam\/www\/central-libraries\/us\/en\/documents\/2022-12\/optimizing-ml-models-with-amx-brief.pdf."},{"key":"e_1_3_3_1_11_2","unstructured":"NVIDIA Corporation. 2025. cuBLAS Library: cublasGemmBatchedEx API. Retrieved September 13 2025 from https:\/\/docs.nvidia.com\/cuda\/cublas\/index.html."},{"key":"e_1_3_3_1_12_2","unstructured":"NVIDIA Corporation. 2025. NVIDIA A100 Tensor Core GPU Architecture. Retrieved September 13 2025 from https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/datacenter\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_3_1_13_2","unstructured":"Abdelghani El Guennouni Kassem Jbilou and Abdelbaki Sadok. 2003. A block version of BiCGSTAB for linear systems with multiple right-hand sides. Electronic Transactions on Numerical Analysis 16 2 (2003) 129\u2013142."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Wolfgang Hackbusch. 1999. A sparse matrix arithmetic based on H-matrices. Part I: Introduction to H-matrices. Computing 62 2 (1999) 89\u2013108.","DOI":"10.1007\/s006070050015"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Wolfgang Hackbusch and Steffen B\u00f6rm. 2002. H2-matrix approximation of integral operators by interpolation. Applied Numerical Mathematics 43 1\u20132 (2002) 129\u2013143.","DOI":"10.1016\/S0168-9274(02)00121-6"},{"key":"e_1_3_3_1_16_2","unstructured":"Nicholas\u00a0J. Higham and Theo Mary. 2019. Solving block low-rank linear systems by LU factorization is numerically stable. eprints.maths.manchester.ac.uk."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00049"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Akihiro Ida. 2022. Solving block low-rank matrix eigenvalue problems. Journal of Information Processing 30 (2022) 538\u2013551.","DOI":"10.2197\/ipsjjip.30.538"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Akihiro Ida Tadashi Ataka and Atsushi Furuya. 2020. Lattice H-Matrices for Massively Parallel Micromagnetic Simulations of Current-Induced Domain Wall Motion. IEEE Transactions on Magnetics 56 4 (2020) 1\u20134.","DOI":"10.1109\/TMAG.2019.2959349"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Akihiro Ida Tadashi Ataka Yasuhito Takahashi Takeshi Mifune Takeshi Iwashita and Atsushi Furuya. 2017. Application of improved H-matrices in micromagnetic simulations of spin torque oscillator. IEEE Transactions on Magnetics 54 3 (2017) 1\u20134.","DOI":"10.1109\/TMAG.2017.2763611"},{"key":"e_1_3_3_1_21_2","unstructured":"Akihiro Ida and Takeshi Iwashita. 2015. HACApK library. Retrieved September 13 2025 from https:\/\/github.com\/Post-Peta-Crest\/ppOpenHPC\/tree\/MATH\/HACApK."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Akihiro Ida Takeshi Iwashita Takeshi Mifune and Yasuhito Takahashi. 2014. Parallel hierarchical matrices with adaptive cross approximation on symmetric multiprocessing clusters. Journal of Information Processing 22 (2014) 642\u2013650.","DOI":"10.2197\/ipsjjip.22.642"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Akihiro Ida Takeshi Iwashita Takeshi Mifune and Yasuhito Takahashi. 2015. Variable preconditioning of Krylov subspace methods for hierarchical matrices with adaptive cross approximation. IEEE Transactions on Magnetics 52 3 (2015) 1\u20134.","DOI":"10.1109\/TMAG.2015.2464104"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Akihiro Ida Takeshi Iwashita Makiko Ohtani and Kazuro Hirahara. 2015. Improvement of hierarchical matrices with adaptive cross approximation for large-scale simulation. Journal of Information Processing 23 3 (2015) 366\u2013372.","DOI":"10.2197\/ipsjjip.23.366"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Akihiro Ida Hiroshi Nakashima Tasuku Hiraishi Ichitaro Yamazaki Rio Yokota and Takeshi Iwashita. 2019. QR factorization of block low-rank matrices with weak admissibility condition. Journal of Information Processing 27 (2019) 831\u2013839.","DOI":"10.2197\/ipsjjip.27.831"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3149457.3149477"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Soichiro Ikuno Yuki Kawaguchi Norihisa Fujita Taku Itoh Susumu Nakata and Kota Watanabe. 2012. Iterative solver for linear system obtained by edge element: variable preconditioned method with mixed precision on GPU. IEEE Transactions on Magnetics 48 2 (2012) 467\u2013470.","DOI":"10.1109\/TMAG.2011.2175375"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2017.05.263"},{"key":"e_1_3_3_1_29_2","unstructured":"Takeshi Iwashita Takeshi Mifune Yuki Noseda Yasuhito Takahashi Masatoshi Kawai and Akihiro Ida. 2014. ppohBEM: BEM-BB framework. Retrieved September 13 2025 from https:\/\/github.com\/Post-Peta-Crest\/ppOpenHPC\/tree\/APPL\/BEM."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Takashi Mifune Naoki Tominaga Yusuke Sogabe Yudai Mizobata Masahiro Yasunaga Akihiro Ida Takeshi Iwashita and Naoyuki Amemiya. 2019. Large-scale electromagnetic field analyses of coils wound with coated conductors using a current-vector-potential formulation with a thin-strip approximation. Superconductor Science and Technology 32 9 (2019) 094002.","DOI":"10.1088\/1361-6668\/ab1d35"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Takumi Miyajima Ryosuke Ando and Akihiro Ida. 2025. An efficient boundary integral equation method applicable to 3D dynamic rupture simulations. JSIAM Letters 17 (2025) 37\u201340.","DOI":"10.14495\/jsiaml.17.37"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-69953-0_16"},{"key":"e_1_3_3_1_33_2","first-page":"9","volume-title":"Proceedings of the 2019 IEEE 13th International Symposium on Embedded Multicore\/Many-core Systems-on-Chip (MCSoC)","author":"Ohshima Satoshi","year":"2019","unstructured":"Satoshi Ohshima, Ichitaro Yamazaki, Akihiro Ida, and Rio Yokota. 2019. Optimization of numerous small dense-matrix\u2013vector multiplications in H-matrix arithmetic on GPU. In Proceedings of the 2019 IEEE 13th International Symposium on Embedded Multicore\/Many-core Systems-on-Chip (MCSoC). IEEE, 9\u201316."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-29927-8_28"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3368474.3368479"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"So Ozawa Akihiro Ida Tetsuya Hoshino and Ryosuke Ando. 2023. Large-scale earthquake sequence simulations on 3-D non-planar faults using the boundary element method accelerated by lattice H-matrices. Geophysical Journal International 232 3 (2023) 1471\u20131481.","DOI":"10.1093\/gji\/ggac386"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/PAW-ATM49560.2019.00008"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Vladimir Puzyrev and Jose\u00a0M. Cela. 2015. A review of block Krylov subspace methods for multisource electromagnetic modelling. Geophysical Journal International 202 2 (2015) 1241\u20131252.","DOI":"10.1093\/gji\/ggv216"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Yousef Saad. 1993. A flexible inner-outer preconditioned GMRES algorithm. SIAM J. Sci. Statist. Comput. 14 (1993) 461\u2013469.","DOI":"10.1137\/0914028"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Naoki Tominaga Takeshi Mifune Akihiro Ida Yusuke Sogabe Takeshi Iwashita and Naoyuki Amemiya. 2017. Application of hierarchical matrices to large-scale electromagnetic field analyses of coils wound with coated conductors. IEEE Transactions on Applied Superconductivity 28 3 (2017) 1\u20135.","DOI":"10.1109\/TASC.2017.2780821"}],"event":{"name":"SCA\/HPCAsia 2026: Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","location":"Osaka Japan","acronym":"SCA\/HPCAsia 2026"},"container-title":["Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region"],"original-title":[],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T10:22:29Z","timestamp":1767954149000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3773656.3773678"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,25]]},"references-count":39,"alternative-id":["10.1145\/3773656.3773678","10.1145\/3773656"],"URL":"https:\/\/doi.org\/10.1145\/3773656.3773678","relation":{},"subject":[],"published":{"date-parts":[[2026,1,25]]},"assertion":[{"value":"2026-01-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}