{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T03:08:16Z","timestamp":1782788896510,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,11,2]],"date-time":"2020-11-02T00:00:00Z","timestamp":1604275200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSF\/Intel","award":["1723773, 1723715"],"award-info":[{"award-number":["1723773, 1723715"]}]},{"name":"NSF","award":["1909661"],"award-info":[{"award-number":["1909661"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,11,2]]},"DOI":"10.1145\/3400302.3415644","type":"proceedings-article","created":{"date-parts":[[2020,12,18]],"date-time":"2020-12-18T01:20:55Z","timestamp":1608254455000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":32,"title":["SuSy"],"prefix":"10.1145","author":[{"given":"Yi-Hsiang","family":"Lai","sequence":"first","affiliation":[{"name":"Cornell University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hongbo","family":"Rong","sequence":"additional","affiliation":[{"name":"Intel"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Size","family":"Zheng","sequence":"additional","affiliation":[{"name":"Peking University, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weihao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiuping","family":"Cui","sequence":"additional","affiliation":[{"name":"Peking University, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yunshan","family":"Jia","sequence":"additional","affiliation":[{"name":"Peking University, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jie","family":"Wang","sequence":"additional","affiliation":[{"name":"University of California"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Brendan","family":"Sullivan","sequence":"additional","affiliation":[{"name":"Cornell University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhiru","family":"Zhang","sequence":"additional","affiliation":[{"name":"Cornell University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yun","family":"Liang","sequence":"additional","affiliation":[{"name":"Peking University, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Youhui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jason","family":"Cong","sequence":"additional","affiliation":[{"name":"University of California"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nithin","family":"George","sequence":"additional","affiliation":[{"name":"Intel"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jose","family":"Alvarez","sequence":"additional","affiliation":[{"name":"Intel"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Christopher","family":"Hughes","sequence":"additional","affiliation":[{"name":"Intel"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pradeep","family":"Dubey","sequence":"additional","affiliation":[{"name":"Intel"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2020,12,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Tiramisu: A Polyhedral Compiler for Expressing Fast and Portable Code. Int'l Symp. on Code Generation and Optimization (CGO)","author":"Baghdadi R.","year":"2019","unstructured":"R. Baghdadi , J. Ray , M. B. Romdhane , E. Del Sozzo , A. Akkas , Y. Zhang , P. Suriana , S. Kamil , and S. Amarasinghe . Tiramisu: A Polyhedral Compiler for Expressing Fast and Portable Code. Int'l Symp. on Code Generation and Optimization (CGO) , 2019 . R. Baghdadi, J. Ray, M. B. Romdhane, E. Del Sozzo, A. Akkas, Y. Zhang, P. Suriana, S. Kamil, and S. Amarasinghe. Tiramisu: A Polyhedral Compiler for Expressing Fast and Portable Code. Int'l Symp. on Code Generation and Optimization (CGO), 2019."},{"key":"e_1_3_2_1_2_1","volume-title":"ACM SIGPLAN Conf. on Principles and Practice of Parallel Programming (PPoPP)","author":"Bondhugula U.","year":"2007","unstructured":"U. Bondhugula , J. Ramanujam , and P. Sadayappan . Automatic Mapping of Nested Loops to FPGAs . ACM SIGPLAN Conf. on Principles and Practice of Parallel Programming (PPoPP) , 2007 . U. Bondhugula, J. Ramanujam, and P. Sadayappan. Automatic Mapping of Nested Loops to FPGAs. ACM SIGPLAN Conf. on Principles and Practice of Parallel Programming (PPoPP), 2007."},{"key":"e_1_3_2_1_3_1","volume-title":"Software Infrastructure for Enabling FPGA-Based Accelerations in Data Centers. Int'l Symp. on Low Power Electronics and Design (ISLPED)","author":"Cong J.","year":"2016","unstructured":"J. Cong , M. Huang , P. Pan , D. Wu , and P. Zhang . Software Infrastructure for Enabling FPGA-Based Accelerations in Data Centers. Int'l Symp. on Low Power Electronics and Design (ISLPED) , 2016 . J. Cong, M. Huang, P. Pan, D. Wu, and P. Zhang. Software Infrastructure for Enabling FPGA-Based Accelerations in Data Centers. Int'l Symp. on Low Power Electronics and Design (ISLPED), 2016."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2011.2110592"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240838"},{"key":"e_1_3_2_1_6_1","volume-title":"Parallel and Pipeline Architecture. Design Automation Conf. (DAC)","author":"Cong J.","year":"2018","unstructured":"J. Cong , P. Wei , C. H. Yu , and P. Zhang . Automated Accelerator Generation and Optimization with Composable , Parallel and Pipeline Architecture. Design Automation Conf. (DAC) , 2018 . J. Cong, P. Wei, C. H. Yu, and P. Zhang. Automated Accelerator Generation and Optimization with Composable, Parallel and Pipeline Architecture. Design Automation Conf. (DAC), 2018."},{"key":"e_1_3_2_1_7_1","volume-title":"Flexible Communication Avoiding Matrix Multiplication on FPGA with High-Level Synthesis. Int'l Symp. on Field-Programmable Gate Arrays (FPGA)","author":"de Fine Licht J.","year":"2020","unstructured":"J. de Fine Licht , G. Kwasniewski , and T. Hoefler . Flexible Communication Avoiding Matrix Multiplication on FPGA with High-Level Synthesis. Int'l Symp. on Field-Programmable Gate Arrays (FPGA) , 2020 . J. de Fine Licht, G. Kwasniewski, and T. Hoefler. Flexible Communication Avoiding Matrix Multiplication on FPGA with High-Level Synthesis. Int'l Symp. on Field-Programmable Gate Arrays (FPGA), 2020."},{"key":"e_1_3_2_1_8_1","volume-title":"et al. Gemmini: An Agile Systolic Array Generator Enabling Systematic Evaluations of Deep-Learning Architectures. arXiv preprint arXiv:1911.09925","author":"Genc H.","year":"2019","unstructured":"H. Genc , A. Haj-Ali , V. Iyer , A. Amid , H. Mao , J. Wright , C. Schmidt , J. Zhao , A. Ou , M. Banister , et al. Gemmini: An Agile Systolic Array Generator Enabling Systematic Evaluations of Deep-Learning Architectures. arXiv preprint arXiv:1911.09925 , 2019 . H. Genc, A. Haj-Ali, V. Iyer, A. Amid, H. Mao, J. Wright, C. Schmidt, J. Zhao, A. Ou, M. Banister, et al. Gemmini: An Agile Systolic Array Generator Enabling Systematic Evaluations of Deep-Learning Architectures. arXiv preprint arXiv:1911.09925, 2019."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1117\/12.932507"},{"key":"e_1_3_2_1_10_1","volume-title":"Hardware Design Methodology with the Alpha Language. FDL'01","author":"Guillou A.-C.","year":"2001","unstructured":"A.-C. Guillou , F. Quiller\u00e9 , P. Quinton , S. Rajopadhye , and T. Risset . Hardware Design Methodology with the Alpha Language. FDL'01 , 2001 . A.-C. Guillou, F. Quiller\u00e9, P. Quinton, S. Rajopadhye, and T. Risset. Hardware Design Methodology with the Alpha Language. FDL'01, 2001."},{"key":"e_1_3_2_1_11_1","volume-title":"PARO: Synthesis of Hardware Accelerators for Multi-dimensional Dataflow-intensive Applications. International Workshop on Applied Reconfigurable Computing","author":"Hannig F.","year":"2008","unstructured":"F. Hannig , H. Ruckdeschel , H. Dutta , and J. Teich . PARO: Synthesis of Hardware Accelerators for Multi-dimensional Dataflow-intensive Applications. International Workshop on Applied Reconfigurable Computing , 2008 . F. Hannig, H. Ruckdeschel, H. Dutta, and J. Teich. PARO: Synthesis of Hardware Accelerators for Multi-dimensional Dataflow-intensive Applications. International Workshop on Applied Reconfigurable Computing, 2008."},{"key":"e_1_3_2_1_12_1","volume-title":"Accelerating Genomics Research with OpenCL, and FPGAs","year":"2017","unstructured":"Intel. Accelerating Genomics Research with OpenCL, and FPGAs . 2017 . Intel. Accelerating Genomics Research with OpenCL, and FPGAs. 2017."},{"key":"e_1_3_2_1_13_1","volume-title":"vLab Academic Cluster. URL: https:\/\/wiki.intel-research.net","year":"2019","unstructured":"Intel. vLab Academic Cluster. URL: https:\/\/wiki.intel-research.net , 2019 . Intel. vLab Academic Cluster. URL: https:\/\/wiki.intel-research.net, 2019."},{"key":"e_1_3_2_1_14_1","volume-title":"In-Datacenter Performance Analysis of a Tensor Processing Unit. Int'l Symp. on Computer Architecture (ISCA)","author":"Jouppi N. P.","year":"2017","unstructured":"N. P. Jouppi , C. Young , N. Patil , D. Patterson , G. Agrawal , R. Bajwa , S. Bates , S. Bhatia , N. Boden , A. Borchers , In-Datacenter Performance Analysis of a Tensor Processing Unit. Int'l Symp. on Computer Architecture (ISCA) , 2017 . N. P. Jouppi, C. Young, N. Patil, D. Patterson, G. Agrawal, R. Bajwa, S. Bates, S. Bhatia, N. Boden, A. Borchers, et al. In-Datacenter Performance Analysis of a Tensor Processing Unit. Int'l Symp. on Computer Architecture (ISCA), 2017."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/321406.321418"},{"key":"e_1_3_2_1_16_1","volume-title":"Spatial: A Language and Compiler for Application Accelerators. ACM SIGPLAN Conf. on Programming Language Design and Implementation (PLDI)","author":"Koeplinger D.","year":"2018","unstructured":"D. Koeplinger , M. Feldman , R. Prabhakar , Y. Zhang , S. Hadjis , R. Fiszel , T. Zhao , L. Nardi , A. Pedram , C. Kozyrakis , Spatial: A Language and Compiler for Application Accelerators. ACM SIGPLAN Conf. on Programming Language Design and Implementation (PLDI) , 2018 . D. Koeplinger, M. Feldman, R. Prabhakar, Y. Zhang, S. Hadjis, R. Fiszel, T. Zhao, L. Nardi, A. Pedram, C. Kozyrakis, et al. Spatial: A Language and Compiler for Application Accelerators. ACM SIGPLAN Conf. on Programming Language Design and Implementation (PLDI), 2018."},{"key":"e_1_3_2_1_17_1","volume-title":"Systolic Arrays for (VLSI)","author":"Kung H.","year":"1978","unstructured":"H. Kung and C. Leiserson . Systolic Arrays for (VLSI) . 1978 . H. Kung and C. Leiserson. Systolic Arrays for (VLSI). 1978."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"H. Kung B. McDanel and S. Q. Zhang. Packing Sparse Convolutional Neural Networks for Efficient Systolic Array Implementations: Column Combining under Joint Optimization. Int'l Conf. on Architectural Support for Programming Languages and Operating Systems (ASPLOS) 2019.  H. Kung B. McDanel and S. Q. Zhang. Packing Sparse Convolutional Neural Networks for Efficient Systolic Array Implementations: Column Combining under Joint Optimization. Int'l Conf. on Architectural Support for Programming Languages and Operating Systems (ASPLOS) 2019.","DOI":"10.1145\/3297858.3304028"},{"key":"e_1_3_2_1_19_1","author":"Kung S.","year":"1985","unstructured":"S. Kung . VLSI Array Processors. IEEE ASSP Magazine , 1985 . S. Kung. VLSI Array Processors. IEEE ASSP Magazine, 1985.","journal-title":"VLSI Array Processors. IEEE ASSP Magazine"},{"key":"e_1_3_2_1_20_1","volume-title":"Virtual Systolic Array for QR Decomposition. Int'l Parallel and Distributed Processing Symp. (IPDPS)","author":"Kurzak J.","year":"2013","unstructured":"J. Kurzak , P. Luszczek , M. Gates , I. Yamazaki , and J. Dongarra . Virtual Systolic Array for QR Decomposition. Int'l Parallel and Distributed Processing Symp. (IPDPS) , 2013 . J. Kurzak, P. Luszczek, M. Gates, I. Yamazaki, and J. Dongarra. Virtual Systolic Array for QR Decomposition. Int'l Parallel and Distributed Processing Symp. (IPDPS), 2013."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3289602.3293910"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1201\/9781482276046-23"},{"key":"e_1_3_2_1_23_1","volume-title":"Image and Video Technology","author":"Le Verge H.","year":"1991","unstructured":"H. Le Verge , C. Mauras , and P. Quinton . The ALPHA Language and Its Use for the Design of Systolic Arrays. Journal of VLSI Signal Processing Systems for Signal , Image and Video Technology , 1991 . H. Le Verge, C. Mauras, and P. Quinton. The ALPHA Language and Its Use for the Design of Systolic Arrays. Journal of VLSI Signal Processing Systems for Signal, Image and Video Technology, 1991."},{"key":"e_1_3_2_1_24_1","volume-title":"Maximizing Parallelism and Minimizing Synchronization with Affine Transforms. ACM SIGPLAN-SIGACT Symp. on Principles of Programming Languages (POPL)","author":"Lim A. W.","year":"1997","unstructured":"A. W. Lim and M. S. Lam . Maximizing Parallelism and Minimizing Synchronization with Affine Transforms. ACM SIGPLAN-SIGACT Symp. on Principles of Programming Languages (POPL) , 1997 . A. W. Lim and M. S. Lam. Maximizing Parallelism and Minimizing Synchronization with Affine Transforms. ACM SIGPLAN-SIGACT Symp. on Principles of Programming Languages (POPL), 1997."},{"key":"e_1_3_2_1_25_1","volume-title":"VTA: An Open Hardware-Software Stack for Deep Learning. arXiv preprint arXiv:1807.04188","author":"Moreau T.","year":"2018","unstructured":"T. Moreau , T. Chen , Z. Jiang , L. Ceze , C. Guestrin , and A. Krishnamurthy . VTA: An Open Hardware-Software Stack for Deep Learning. arXiv preprint arXiv:1807.04188 , 2018 . T. Moreau, T. Chen, Z. Jiang, L. Ceze, C. Guestrin, and A. Krishnamurthy. VTA: An Open Hardware-Software Stack for Deep Learning. arXiv preprint arXiv:1807.04188, 2018."},{"key":"e_1_3_2_1_26_1","volume-title":"A Customizable Matrix Multiplication Framework for the Intel HARPv2 Platform - A Deep Learning Case Study. Int'l Symp. on Field-Programmable Gate Arrays (FPGA)","author":"Moss D.","year":"2018","unstructured":"D. Moss , S. Krishnan , E. Nurvitadhi , and A Customizable Matrix Multiplication Framework for the Intel HARPv2 Platform - A Deep Learning Case Study. Int'l Symp. on Field-Programmable Gate Arrays (FPGA) , 2018 . D. Moss, S. Krishnan, E. Nurvitadhi, and et al. A Customizable Matrix Multiplication Framework for the Intel HARPv2 Platform - A Deep Learning Case Study. Int'l Symp. on Field-Programmable Gate Arrays (FPGA), 2018."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3107953"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/800015.808184"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_30_1","volume-title":"Programmatic Control of a Compiler for Generating High-Performance Spatial Hardware. arXiv preprint arXiv:1711.07606","author":"Rong H.","year":"2017","unstructured":"H. Rong . Programmatic Control of a Compiler for Generating High-Performance Spatial Hardware. arXiv preprint arXiv:1711.07606 , 2017 . H. Rong. Programmatic Control of a Compiler for Generating High-Performance Spatial Hardware. arXiv preprint arXiv:1711.07606, 2017."},{"issue":"2","key":"e_1_3_2_1_31_1","first-page":"127","article-title":"PICO-NPA: High-level Synthesis of Nonprogrammable Hardware Accelerators. Journal of VLSI Signal Processing Systems for Signal","volume":"31","author":"Schreiber R.","year":"2002","unstructured":"R. Schreiber , S. Aditya , S. Mahlke , V. Kathail , B. R. Rau , D. Cronquist , and M. Sivaraman . PICO-NPA: High-level Synthesis of Nonprogrammable Hardware Accelerators. Journal of VLSI Signal Processing Systems for Signal , Image and Video Technology , 31 ( 2 ): 127 -- 142 , 2002 . R. Schreiber, S. Aditya, S. Mahlke, V. Kathail, B. R. Rau, D. Cronquist, and M. Sivaraman. PICO-NPA: High-level Synthesis of Nonprogrammable Hardware Accelerators. Journal of VLSI Signal Processing Systems for Signal, Image and Video Technology, 31(2):127--142, 2002.","journal-title":"Image and Video Technology"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00033"},{"key":"e_1_3_2_1_33_1","volume-title":"Automated Systolic Array Architecture Synthesis for High Throughput CNN Inference on FPGAs. Design Automation Conf. (DAC)","author":"Wei X.","year":"2017","unstructured":"X. Wei , C. H. Yu , P. Zhang , Y. Chen , Y. Wang , H. Hu , Y. Liang , and J. Cong . Automated Systolic Array Architecture Synthesis for High Throughput CNN Inference on FPGAs. Design Automation Conf. (DAC) , 2017 . X. Wei, C. H. Yu, P. Zhang, Y. Chen, Y. Wang, H. Hu, Y. Liang, and J. Cong. Automated Systolic Array Architecture Synthesis for High Throughput CNN Inference on FPGAs. Design Automation Conf. (DAC), 2017."},{"key":"e_1_3_2_1_34_1","volume-title":"SDAccel: Enabling Hardware-Accelerated Software","year":"2020","unstructured":"Xilinx. SDAccel: Enabling Hardware-Accelerated Software . 2020 . Xilinx. SDAccel: Enabling Hardware-Accelerated Software. 2020."},{"key":"e_1_3_2_1_35_1","volume-title":"Formal Synthesis of Control Signals for Systolic Arrays","author":"Xue J.","year":"1992","unstructured":"J. Xue . Formal Synthesis of Control Signals for Systolic Arrays . 1992 . J. Xue. Formal Synthesis of Control Signals for Systolic Arrays. 1992."}],"event":{"name":"ICCAD '20: IEEE\/ACM International Conference on Computer-Aided Design","location":"Virtual Event USA","acronym":"ICCAD '20","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CAS","IEEE CEDA","IEEE CS"]},"container-title":["Proceedings of the 39th International Conference on Computer-Aided Design"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3400302.3415644","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3400302.3415644","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3400302.3415644","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T21:31:41Z","timestamp":1750195901000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3400302.3415644"}},"subtitle":["a programming model for productive construction of high-performance systolic arrays on FPGAs"],"short-title":[],"issued":{"date-parts":[[2020,11,2]]},"references-count":35,"alternative-id":["10.1145\/3400302.3415644","10.1145\/3400302"],"URL":"https:\/\/doi.org\/10.1145\/3400302.3415644","relation":{},"subject":[],"published":{"date-parts":[[2020,11,2]]},"assertion":[{"value":"2020-12-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}