{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T11:38:48Z","timestamp":1767958728521,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,1,26]]},"DOI":"10.1145\/3773656.3773663","type":"proceedings-article","created":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T10:22:11Z","timestamp":1767954131000},"page":"102-111","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Unified Acceleration: Weight-Stationary GEMM on HPC-oriented Elastic CGRAs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3016-1628","authenticated-orcid":false,"given":"Chenlin","family":"Shi","sequence":"first","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Hyogo, Japan and The University of Electro-Communications, Chofu, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8165-9792","authenticated-orcid":false,"given":"Boma Anantasatya","family":"Adhi","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Hyogo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9854-0081","authenticated-orcid":false,"given":"Lin","family":"Teng","sequence":"additional","affiliation":[{"name":"The University of Electro-Communications, Chofu, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1543-5277","authenticated-orcid":false,"given":"Jiaheng","family":"Liu","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Hyogo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0315-3216","authenticated-orcid":false,"given":"Shinobu","family":"Miwa","sequence":"additional","affiliation":[{"name":"The University of Electro-Communications, Chofu, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6681-4192","authenticated-orcid":false,"given":"Kentaro","family":"Sano","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Hyogo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,25]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW59300.2023.00077"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER51413.2022.00046"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW55747.2022.00113"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICFPT56656.2022.9974525"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Fischer Black and Myron Scholes. 1973. The pricing of options and corporate liabilities. Journal of Political Economy 81 3 (1973) 637\u2013654.","DOI":"10.1086\/260062"},{"key":"e_1_3_3_2_7_2","unstructured":"Evan Burton. [n. d.]. FTLE Unsteady Flows. https:\/\/github.com\/iamboorrito\/FTLE-Unsteady-Flows."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Longlong Chen Jianfeng Zhu Yangdong Deng Zhaoshi Li Jian Chen Xiaowei Jiang Shouyi Yin Shaojun Wei and Leibo Liu. 2021. An elastic task scheduling scheme on coarse-grained reconfigurable architectures. IEEE Transactions on Parallel and Distributed Systems 32 12 (2021) 3066\u20133080.","DOI":"10.1109\/TPDS.2021.3084804"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","unstructured":"Shiyi Chen Zheng Wang Xiaowen Shan and Gary\u00a0D Doolen. 1992. Lattice Boltzmann computational fluid dynamics in three dimensions. Journal of Statistical Physics 68 3 (1992) 379\u2013400. 10.1007\/BF01341754","DOI":"10.1007\/BF01341754"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","unstructured":"Suhyeong Choi Jinwook Jung Andrew\u00a0B. Kahng Minsoo Kim Chul-Hong Park Bodhisatta Pramanik and Dooseok Yoon. 2024. PROBE3.0: A Systematic Framework for Design-Technology Pathfinding With Improved Design Enablement. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 43 4 (2024) 1218\u20131231. 10.1109\/TCAD.2023.3334591","DOI":"10.1109\/TCAD.2023.3334591"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Lawrence\u00a0T. Clark Vinay Vashishtha Lucian Shifren Aditya Gujja Saurabh Sinha Brian Cline Chandarasekaran Ramamurthy and Greg Yeric. 2016. ASAP7: A 7-nm FinFET predictive process design kit. Microelectronics Journal 53 (2016) 105\u2013115.","DOI":"10.1016\/j.mejo.2016.04.006"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP.2018.8445106"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00065"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Hadi Esmaeilzadeh Soroush Ghodrati Andrew\u00a0B. Kahng Sean Kinzer Susmita\u00a0Dey Manasi Sachin\u00a0S. Sapatnekar and Zhiang Wang. 2024. Performance Analysis of CNN Inference\/Training with Convolution and Non-Convolution Operations on ASIC Accelerators. ACM Trans. Des. Autom. Electron. Syst. 30 1 Article 3 (Nov. 2024) 34\u00a0pages.","DOI":"10.1145\/3696665"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586216"},{"key":"e_1_3_3_2_16_2","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arxiv:https:\/\/arXiv.org\/abs\/1512.03385\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1512.03385"},{"key":"e_1_3_3_2_17_2","unstructured":"Dan Hendrycks and Kevin Gimpel. 2023. Gaussian Error Linear Units (GELUs). arxiv:https:\/\/arXiv.org\/abs\/1606.08415\u00a0[cs.LG]"},{"key":"e_1_3_3_2_18_2","volume-title":"HPC Programming [Japanese]","author":"Seiji\u00a0Fujino Toshio\u00a0Nagashima Hikaru\u00a0Samukawa,","year":"2009","unstructured":"Toshio\u00a0Nagashima Hikaru\u00a0Samukawa, Seiji\u00a0Fujino and Daisuke Takahashi. 2009. HPC Programming [Japanese]. Ohm Publishing Co., Tokyo, Japan."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/2435264.2435296"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Takaji Inamuro Masato Yoshino and Fumimaru Ogino. 1995. A non-slip boundary condition for lattice boltzmann simulations. Journal of Physics of Fluids 7 12 (1995) 2928\u20132930.","DOI":"10.1063\/1.868766"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP57973.2023.00036"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","unstructured":"Norman\u00a0P. Jouppi et\u00a0al. 2017. In-Datacenter Performance Analysis of a Tensor Processing Unit. SIGARCH Comput. Archit. News 45 2 (June 2017) 1\u201312. 10.1145\/3140659.3080246","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/MC.1982.1653825"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00080"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","unstructured":"Leibo Liu Jianfeng Zhu Zhaoshi Li Yanan Lu Yangdong Deng Jie Han Shouyi Yin and Shaojun Wei. 2019. A Survey of Coarse-Grained Reconfigurable Architecture and Design: Taxonomy Challenges and Applications. ACM Comput. Surv. 52 6 Article 118 (Oct. 2019) 39\u00a0pages. 10.1145\/3357375","DOI":"10.1145\/3357375"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247873"},{"key":"e_1_3_3_2_27_2","volume-title":"NVDLA Open Source Hardware, version 1.0","author":"Corporation NVIDIA","year":"2017","unstructured":"NVIDIA Corporation. 2017. NVDLA Open Source Hardware, version 1.0. Technical Report. https:\/\/github.com\/nvdla"},{"key":"e_1_3_3_2_28_2","unstructured":"OpenAI. 2024. GPT-4 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2303.08774\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","unstructured":"Artur Podobas Kentaro Sano and Satoshi Matsuoka. 2020. A Survey on Coarse-Grained Reconfigurable Architectures From a Performance Perspective. IEEE Access 8 (2020) 146719\u2013146743. 10.1109\/ACCESS.2020.3012084","DOI":"10.1109\/ACCESS.2020.3012084"},{"key":"e_1_3_3_2_30_2","unstructured":"Louis-Noel Pouchet and Tomofumi Yuki. 2016. Polybench C 4.2. https:\/\/sourceforge.net\/projects\/polybench\/"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716013"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW63119.2024.00124"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW55747.2022.00115"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/FPL57034.2022.00067"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/FPL57034.2022.00067"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00016"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/FPT.2007.4439254"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9891914"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3728179.3728196"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Vivienne Sze Yu-Hsin Chen Tien-Ju Yang and Joel\u00a0S Emer. 2017. Efficient processing of deep neural networks: A tutorial and survey. Proc. IEEE 105 12 (2017) 2295\u20132329.","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"e_1_3_3_2_41_2","first-page":"23","volume-title":"ASPIRE Workshop 2024","author":"TAKISHITA Hajime","year":"2024","unstructured":"Hajime TAKISHITA, Takuya KOJIMA, and Hideharu AMANO. 2024. CGRA architecture generation for implementing a systolic array. In ASPIRE Workshop 2024. IEICE, 23."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00042"},{"key":"e_1_3_3_2_43_2","unstructured":"Daniel Vazquez Jose Miranda Alfonso Rodriguez Andres Otero Pascuale\u00a0Davide Schiavone and David Atienza. 2024. Strela: Streaming elastic cgra accelerator for embedded systems. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.12503 (2024)."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/ReCoSoC.2018.8449384"}],"event":{"name":"SCA\/HPCAsia 2026: Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","location":"Osaka Japan","acronym":"SCA\/HPCAsia 2026"},"container-title":["Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region"],"original-title":[],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T10:22:51Z","timestamp":1767954171000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3773656.3773663"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,25]]},"references-count":43,"alternative-id":["10.1145\/3773656.3773663","10.1145\/3773656"],"URL":"https:\/\/doi.org\/10.1145\/3773656.3773663","relation":{},"subject":[],"published":{"date-parts":[[2026,1,25]]},"assertion":[{"value":"2026-01-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}