{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T18:36:20Z","timestamp":1777487780854,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,12]],"date-time":"2023-02-12T00:00:00Z","timestamp":1676160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2217003,2213701"],"award-info":[{"award-number":["2217003,2213701"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,12]]},"DOI":"10.1145\/3543622.3573210","type":"proceedings-article","created":{"date-parts":[[2023,2,10]],"date-time":"2023-02-10T23:15:13Z","timestamp":1676070913000},"page":"153-164","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":41,"title":["CHARM:\n            <u>C<\/u>\n            omposing\n            <u>H<\/u>\n            eterogeneous\n            <u>A<\/u>\n            ccele\n            <u>R<\/u>\n            ators for\n            <u>M<\/u>\n            atrix Multiply on Versal ACAP Architecture"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3659-339X","authenticated-orcid":false,"given":"Jinming","family":"Zhuang","sequence":"first","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0751-8227","authenticated-orcid":false,"given":"Jason","family":"Lau","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6646-8146","authenticated-orcid":false,"given":"Hanchen","family":"Ye","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7655-4080","authenticated-orcid":false,"given":"Zhuoping","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9153-7318","authenticated-orcid":false,"given":"Yubo","family":"Du","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4918-478X","authenticated-orcid":false,"given":"Jack","family":"Lo","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices Inc, San Jose, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6668-4562","authenticated-orcid":false,"given":"Kristof","family":"Denolf","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices Inc, Longmont, CO, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2956-8428","authenticated-orcid":false,"given":"Stephen","family":"Neuendorffer","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices Inc, San Jose, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7498-0206","authenticated-orcid":false,"given":"Alex","family":"Jones","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4029-4034","authenticated-orcid":false,"given":"Jingtong","family":"Hu","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3016-0270","authenticated-orcid":false,"given":"Deming","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2887-6963","authenticated-orcid":false,"given":"Jason","family":"Cong","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0493-1844","authenticated-orcid":false,"given":"Peipei","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,2,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Attention is all you need. Advances in neural information processing systems, 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, ?ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052569"},{"key":"e_1_3_2_1_3_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929, 2020."},{"key":"e_1_3_2_1_4_1","volume-title":"arXiv preprint arXiv:1907.10701","author":"Wang Yu Emma","year":"2019","unstructured":"Yu Emma Wang, Gu-Yeon Wei, and David Brooks. Benchmarking TPU, GPU, and CPU platforms for deep learning. arXiv preprint arXiv:1907.10701, 2019."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-16-7487-7_7"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3110993"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322231"},{"key":"e_1_3_2_1_9_1","first-page":"1942","volume-title":"Networking Storage and Analysis","author":"Demmel Jim","year":"2012","unstructured":"Jim Demmel. Communication avoiding algorithms. In 2012 SC Companion: High Performance Computing, Networking Storage and Analysis, pages 1942--2000. IEEE, 2012."},{"key":"e_1_3_2_1_10_1","unstructured":"AMD\/Xilinx. Versal Adaptive Compute Acceleration Platform."},{"key":"e_1_3_2_1_11_1","volume-title":"IP Overlays of Deep learning Processing Unit","author":"AMD.","year":"2022","unstructured":"AMD. IP Overlays of Deep learning Processing Unit , 2022."},{"key":"e_1_3_2_1_12_1","volume-title":"ACM SIGARCH Computer Architecture News","author":"Yu-Hsin","year":"2016","unstructured":"Yu-Hsin Chen et al. Eyeriss: A spatial architecture for energy-efficient dataflow for convolutional neural networks. ACM SIGARCH Computer Architecture News, 2016."},{"key":"e_1_3_2_1_13_1","volume-title":"Eyeriss v2: A flexible accelerator for emerging deep neural networks on mobile devices","author":"Chen Yu-Hsin","year":"2019","unstructured":"Yu-Hsin Chen, Tien-Ju Yang, Joel Emer, and Vivienne Sze. Eyeriss v2: A flexible accelerator for emerging deep neural networks on mobile devices. IEEE Journal on Emerging and Selected Topics in Circuits and Systems, 9(2):292--308, 2019."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750389"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00035"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICFPT51103.2020.00011"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00012"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00063"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373087.3375296"},{"key":"e_1_3_2_1_20_1","first-page":"161","volume-title":"Proc. of FPGA","author":"Chen","year":"2015","unstructured":"Chen Zhang et al. Optimizing fpga-based accelerator design for deep convolutional neural networks. In Proc. of FPGA, pages 161--170. ACM, 2015."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3174243.3174258"},{"key":"e_1_3_2_1_22_1","first-page":"93","volume-title":"Jason Cong. AutoSA: A Polyhedral Compiler for High-Performance Systolic Arrays on FPGA. In The 2021 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays, FPGA '21","author":"Wang Jie","year":"2021","unstructured":"Jie Wang, Licheng Guo, and Jason Cong. AutoSA: A Polyhedral Compiler for High-Performance Systolic Arrays on FPGA. In The 2021 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays, FPGA '21, page 93--104. Association for Computing Machinery, Feb 2021."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3490422.3502357"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530420"},{"key":"e_1_3_2_1_25_1","first-page":"125","volume-title":"Peipei Zhou. Latte: Locality Aware Transformation for High-Level Synthesis. In 2018 IEEE 26th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","author":"Cong Jason","year":"2018","unstructured":"Jason Cong, Peng Wei, Cody Hao Yu, and Peipei Zhou. Latte: Locality Aware Transformation for High-Level Synthesis. In 2018 IEEE 26th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM), pages 125--128, 2018."},{"key":"e_1_3_2_1_26_1","first-page":"172","volume-title":"Andr\u00e9 DeHon. Energy Efficiency of Full Pipelining: A Case Study for Matrix Multiplication. In 2016 IEEE 24th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","author":"Zhou Peipei","year":"2016","unstructured":"Peipei Zhou, Hyunseok Park, Zhenman Fang, Jason Cong, and Andr\u00e9 DeHon. Energy Efficiency of Full Pipelining: A Case Study for Matrix Multiplication. In 2016 IEEE 24th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM), pages 172--175, 2016."},{"key":"e_1_3_2_1_27_1","first-page":"273","volume-title":"Jason Cong. MOCHA: Multinode Cost Optimization in Heterogeneous Clouds with Accelerators. In The 2021 ACM\/SIGDA International Symposium on Field- Programmable Gate Arrays, FPGA '21","author":"Zhou Peipei","year":"2021","unstructured":"Peipei Zhou, Jiayi Sheng, Cody Hao Yu, Peng Wei, Jie Wang, Di Wu, and Jason Cong. MOCHA: Multinode Cost Optimization in Heterogeneous Clouds with Accelerators. In The 2021 ACM\/SIGDA International Symposium on Field- Programmable Gate Arrays, FPGA '21, page 273--279, New York, NY, USA, 2021. Association for Computing Machinery."},{"key":"e_1_3_2_1_28_1","first-page":"56","volume-title":"Proc. ICCAD","author":"Xiaofan","unstructured":"Xiaofan Zhang et al. Dnnbuilder: an automated tool for building high- performance dnn hardware accelerators for fpgas. In Proc. ICCAD, page 56. ACM, 2018."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3400302.3415609"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037702"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304014"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00016"},{"key":"e_1_3_2_1_33_1","unstructured":"Nvidia. Website. http:\/\/nvdla.org\/."},{"key":"e_1_3_2_1_34_1","first-page":"9","volume-title":"Peipei Zhou. A Fully Pipelined and Dynamically Composable Architecture of CGRA. In 2014 IEEE 22nd Annual International Symposium on Field-Programmable Custom Computing Machines","author":"Cong Jason","year":"2014","unstructured":"Jason Cong, Hui Huang, Chiyuan Ma, Bingjun Xiao, and Peipei Zhou. A Fully Pipelined and Dynamically Composable Architecture of CGRA. In 2014 IEEE 22nd Annual International Symposium on Field-Programmable Custom Computing Machines, pages 9--16, 2014."},{"key":"e_1_3_2_1_35_1","first-page":"379","volume-title":"Glenn Reinman. CHARM: A Composable Heterogeneous Accelerator-Rich Microprocessor. In Proceedings of the 2012 ACM\/IEEE International Symposium on Low Power Electronics and Design, ISLPED '12","author":"Cong Jason","year":"2012","unstructured":"Jason Cong, Mohammad Ali Ghodrat, Michael Gill, Beayna Grigorian, and Glenn Reinman. CHARM: A Composable Heterogeneous Accelerator-Rich Microprocessor. In Proceedings of the 2012 ACM\/IEEE International Symposium on Low Power Electronics and Design, ISLPED '12, page 379--384, New York, NY, USA, 2012. Association for Computing Machinery."},{"key":"e_1_3_2_1_36_1","volume-title":"Versal AI Core Series VCK190 Evaluation Kit","year":"2022","unstructured":"AMD\/Xilinx. Versal AI Core Series VCK190 Evaluation Kit, 2022."},{"key":"e_1_3_2_1_37_1","volume-title":"AI Engine Technology","year":"2022","unstructured":"AMD\/Xilinx. AI Engine Technology, 2022."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2011.2110592"},{"key":"e_1_3_2_1_39_1","volume-title":"FPGA HLS Today: successes, challenges, and opportunities. ACM Transactions on Reconfigurable Technology and Systems (TRETS), 15(4):1--42","author":"Cong Jason","year":"2022","unstructured":"Jason Cong, Jason Lau, Gai Liu, Stephen Neuendorffer, Peichen Pan, Kees Vissers, and Zhiru Zhang. FPGA HLS Today: successes, challenges, and opportunities. ACM Transactions on Reconfigurable Technology and Systems (TRETS), 15(4):1--42, 2022."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/SASP.2009.5226333"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2011.29"},{"key":"e_1_3_2_1_42_1","first-page":"2012","article-title":"productivity, performance, and software constraints","author":"Liang Yun","year":"2012","unstructured":"Yun Liang, Kyle Rupnow, Yinan Li, Dongbo Min, Minh N Do, and Deming Chen. High-level synthesis: productivity, performance, and software constraints. Journal of Electrical and Computer Engineering, 2012, 2012.","journal-title":"Journal of Electrical and Computer Engineering"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM51124.2021.00032"},{"key":"e_1_3_2_1_44_1","unstructured":"AMD\/Xilinx. Adaptive Data Flow API."},{"key":"e_1_3_2_1_45_1","unstructured":"AMD\/Xilinx. Board evaluation and management Tool."},{"key":"e_1_3_2_1_46_1","unstructured":"AMD\/Xilinx. AI Engine API and Intrinsics User Guide."},{"key":"e_1_3_2_1_47_1","unstructured":"AMD\/Xilinx. Versal ACAP AI Engine System C simulator."},{"key":"e_1_3_2_1_48_1","volume-title":"H-GCN: A graph convolutional network accelerator on versal acap architecture. arXiv preprint arXiv:2206.13734","author":"Zhang Chengming","year":"2022","unstructured":"Chengming Zhang, Tong Geng, Anqi Guo, Jiannan Tian, Martin Herbordt, Ang Li, and Dingwen Tao. H-GCN: A graph convolutional network accelerator on versal acap architecture. arXiv preprint arXiv:2206.13734, 2022."}],"event":{"name":"FPGA '23: The 2023 ACM\/SIGDA International Symposium on Field Programmable Gate Arrays","location":"Monterey CA USA","acronym":"FPGA '23","sponsor":["SIGDA ACM Special Interest Group on Design Automation"]},"container-title":["Proceedings of the 2023 ACM\/SIGDA International Symposium on Field Programmable Gate Arrays"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3543622.3573210","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3543622.3573210","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3543622.3573210","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:48Z","timestamp":1750186848000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3543622.3573210"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,12]]},"references-count":48,"alternative-id":["10.1145\/3543622.3573210","10.1145\/3543622"],"URL":"https:\/\/doi.org\/10.1145\/3543622.3573210","relation":{},"subject":[],"published":{"date-parts":[[2023,2,12]]},"assertion":[{"value":"2023-02-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}