{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:30:29Z","timestamp":1773318629408,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"name":"Guangdong S&T Program","award":["2024B0101040005"],"award-info":[{"award-number":["2024B0101040005"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759769","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"1816-1829","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["HStencil: Matrix-Vector Stencil Computation with Interleaved Outer Product and MLA"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-3774-9115","authenticated-orcid":false,"given":"Han","family":"Huang","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0770-3086","authenticated-orcid":false,"given":"Jiabin","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1382-280X","authenticated-orcid":false,"given":"Guangnan","family":"Feng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3507-4299","authenticated-orcid":false,"given":"Xianwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5582-1031","authenticated-orcid":false,"given":"Dan","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9318-5715","authenticated-orcid":false,"given":"Zhiguang","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5315-3375","authenticated-orcid":false,"given":"Yutong","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","volume-title":"Clang Compiler User\u2019s Manual","year":"2023","unstructured":"LLVM Project 2023. Clang Compiler User\u2019s Manual. LLVM Project. https:\/\/releases.llvm.org\/17.0.1\/tools\/clang\/docs\/index.html Version 17.0.1."},{"key":"e_1_3_3_2_3_2","unstructured":"Apple. 2024. Apple introduces M4 Pro and M4 Max. https:\/\/www.apple.com\/newsroom\/2024\/10\/apple-introduces-m4-pro-and-m4-max\/ Apple Newsroom."},{"key":"e_1_3_3_2_4_2","unstructured":"Apple. 2024. Command Line Tools for Xcode 16.2 Beta 2. https:\/\/download.developer.apple.com\/Developer_Tools\/Command_Line_Tools_for_Xcode_16.2_beta_2\/Command_Line_Tools_for_Xcode_16.2_beta_2.dmg Apple Developer Tools."},{"key":"e_1_3_3_2_5_2","volume-title":"SME and SME2 Overview","year":"2023","unstructured":"ARM. 2023. SME and SME2 Overview. https:\/\/developer.arm.com\/documentation\/109246\/0100\/SME-Overview\/SME-and-SME2"},{"key":"e_1_3_3_2_6_2","unstructured":"ARM. 2023. Streaming SVE mode and ZA storage. https:\/\/developer.arm.com\/documentation\/109246\/0100\/Introduction\/The-Scalable-Matrix-Extensions\/Streaming-SVE-mode-and-ZA-storage. Accessed: 2023-10-24."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","unstructured":"Krste Asanovic Rastislav Bodik James Demmel Tony Keaveny Kurt Keutzer John Kubiatowicz Nelson Morgan David Patterson Koushik Sen John Wawrzynek David Wessel and Katherine Yelick. 2009. A view of the parallel computing landscape. Commun. ACM 52 10 (Oct. 2009) 56\u201367. 10.1145\/1562764.1562783","DOI":"10.1145\/1562764.1562783"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/125826.125932"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/2872362.2872368"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638476"},{"key":"e_1_3_3_2_11_2","unstructured":"Nvidia Corporation. 2021. Nvidia A100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-genomics-ep\/ampere-architecture-white-paper"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3331057"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651378"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441599"},{"key":"e_1_3_3_2_15_2","volume-title":"Cloud TPU","author":"Cloud Google","year":"2021","unstructured":"Google Cloud. 2021. Cloud TPU. https:\/\/cloud.google.com\/tpu"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-63778-0_15"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-19861-8_13"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI54635.2022.00051"},{"key":"e_1_3_3_2_19_2","volume-title":"Advanced Matrix Extensions Overview","author":"Corporation Intel","year":"2021","unstructured":"Intel Corporation. 2021. Advanced Matrix Extensions Overview. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/docs\/accelerator-engines\/advanced-matrix-extensions\/overview.html"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3649411.3649412"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","unstructured":"Vinay Kukutla Ramachandra Achar and Wai\u00a0Kong Lee. 2024. TC-GVF: Tensor Core GPU based Vector Fitting via Accelerated Tall-Skinny QR Solvers. IEEE Transactions on Components Packaging and Manufacturing Technology (2024) 1\u20131. 10.1109\/TCPMT.2024.3410298","DOI":"10.1109\/TCPMT.2024.3410298"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-69583-4_1"},{"key":"e_1_3_3_2_23_2","series-title":"(SC \u201922)","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Li Shigang","year":"2022","unstructured":"Shigang Li, Kazuki Osawa, and Torsten Hoefler. 2022. Efficient quantized sparse matrix operations on tensor cores. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Dallas, Texas) (SC \u201922). IEEE Press, Article 37, 15\u00a0pages."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652032.3657567"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532392"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607051"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00058"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","unstructured":"Chandra\u00a0Sekhar Mummidi Victor\u00a0C. Ferreira Sudarshan Srinivasan and Sandip Kundu. 2024. Highly Efficient Self-checking Matrix Multiplication on Tiled AMX Accelerators. ACM Trans. Archit. Code Optim. 21 2 Article 21 (Feb. 2024) 22\u00a0pages. 10.1145\/3633332","DOI":"10.1145\/3633332"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00059"},{"key":"e_1_3_3_2_30_2","unstructured":"Stefan Remke and Alexander Breuer. 2024. Hello SME! Generating Fast Matrix Multiplication Kernels Using the Scalable Matrix Extension. arxiv:https:\/\/arXiv.org\/abs\/2409.18779\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2409.18779"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","unstructured":"Kevin Stock Martin Kong Tobias Grosser Louis-No\u00ebl Pouchet Fabrice Rastello J. Ramanujam and P. Sadayappan. 2014. A framework for enhancing data reuse via associative reordering. SIGPLAN Not. 49 6 (June 2014) 65\u201376. 10.1145\/2666356.2594342","DOI":"10.1145\/2666356.2594342"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","unstructured":"Haotian Wang Wangdong Yang Rong Hu Renqiu Ouyang Kenli Li and Keqin Li. 2023. A Novel Parallel Algorithm for Sparse Tensor Matrix Chain Multiplication via TCU-Acceleration. IEEE Transactions on Parallel and Distributed Systems 34 8 (2023) 2419\u20132432. 10.1109\/TPDS.2023.3288520","DOI":"10.1109\/TPDS.2023.3288520"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS56514.2022.00018"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638479"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476149"},{"key":"e_1_3_3_2_36_2","unstructured":"Orestis Zachariadis Nitin Satpute Juan G\u00f3mez-Luna and Joaqu\u00edn Olivares. 2020. Accelerating Sparse Matrix-Matrix Multiplication with GPU Tensor Cores. CoRR abs\/2009.14600 (2020). arXiv:https:\/\/arXiv.org\/abs\/2009.14600https:\/\/arxiv.org\/abs\/2009.14600"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673108"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00088"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3369583.3392685"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ScalA51936.2020.00011"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577516"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472456.3473522"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330351"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00059"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD53106.2021.00054"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3650200.3656611"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759769","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:46:55Z","timestamp":1773254815000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759769"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":45,"alternative-id":["10.1145\/3712285.3759769","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759769","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}