{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:35:18Z","timestamp":1773246918375,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Science Foundation of China","award":["T2293700,T2293701,T2325001"],"award-info":[{"award-number":["T2293700,T2293701,T2325001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4]]},"DOI":"10.1145\/3626202.3637566","type":"proceedings-article","created":{"date-parts":[[2024,4,2]],"date-time":"2024-04-02T18:04:51Z","timestamp":1712081091000},"page":"199-210","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["POPA: Expressing High and Portable Performance across Spatial and Vector Architectures for Tensor Computations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2127-6011","authenticated-orcid":false,"given":"Xiaochen","family":"Hao","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3275-7791","authenticated-orcid":false,"given":"Hongbo","family":"Rong","sequence":"additional","affiliation":[{"name":"Parallel Computing Lab, Intel, Santa Clara, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1464-5271","authenticated-orcid":false,"given":"Mingzhe","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6915-1489","authenticated-orcid":false,"given":"Ce","family":"Sun","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6944-3883","authenticated-orcid":false,"given":"Hong","family":"Jiang","sequence":"additional","affiliation":[{"name":"Intel, Santa Clara, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9076-7998","authenticated-orcid":false,"given":"Yun","family":"Liang","sequence":"additional","affiliation":[{"name":"Peking University &amp; Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,4,2]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Symposium on Operating Systems Design and Implementation (OSDI'16)","author":"Abadi Mart'in","year":"2016","unstructured":"Mart'in Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. Tensorflow: A system for large-scale machine learning. In Symposium on Operating Systems Design and Implementation (OSDI'16)."},{"key":"e_1_3_2_1_2_1","volume-title":"Tensor decompositions for learning latent variable models. The Journal of Machine Learning Research","author":"Anandkumar Animashree","year":"2014","unstructured":"Animashree Anandkumar, Rong Ge, Daniel Hsu, Sham M Kakade, and Matus Telgarsky. 2014. Tensor decompositions for learning latent variable models. The Journal of Machine Learning Research (2014)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3317550.3321441"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356173"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356162"},{"key":"e_1_3_2_1_7_1","volume-title":"Symposium on Operating Systems Design and Implementation (OSDI'18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. TVM: An automated end-to-end optimizing compiler for deep learning. In Symposium on Operating Systems Design and Implementation (OSDI'18)."},{"key":"e_1_3_2_1_8_1","unstructured":"Yen-Kuang Chen and S.Y. Kung. 1998. A Systolic Design Methodology with Application to Full-Search Block-Matching Architectures. Journal of VLSI signal processing systems for signal image and video technology (1998)."},{"key":"e_1_3_2_1_9_1","volume-title":"Professional CUDA C programming","author":"Cheng John","unstructured":"John Cheng, Max Grossman, and Ty McKercher. 2014. Professional CUDA C programming. John Wiley & Sons."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240850"},{"key":"e_1_3_2_1_11_1","volume-title":"IEEE Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM'21)","author":"Chi Yuze","year":"2021","unstructured":"Yuze Chi, Licheng Guo, Jason Lau, Young-kyu Choi, Jie Wang, and Jason Cong. 2021. Extending high-level synthesis for task-parallel programs. In IEEE Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM'21)."},{"key":"e_1_3_2_1_12_1","volume-title":"IEEE Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM'18)","author":"Cong Jason","year":"2018","unstructured":"Jason Cong, Zhenman Fang, Michael Lo, Hanrui Wang, Jingxian Xu, and Shaochong Zhang. 2018. Understanding performance differences of FPGAs and GPUs. In IEEE Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM'18)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240838"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00063"},{"key":"e_1_3_2_1_15_1","volume-title":"IEEE International Conference on Application-specific Systems, Architectures and Processors (ASAP'18)","author":"Sozzo Emanuele Del","year":"2018","unstructured":"Emanuele Del Sozzo, Riyadh Baghdadi, Saman Amarasinghe, and Marco D Santambrogio. 2018. A unified backend for targeting FPGAs from DSLs. In IEEE International Conference on Application-specific Systems, Architectures and Processors (ASAP'18)."},{"key":"e_1_3_2_1_16_1","volume-title":"Sven Hammarling, and Richard J. Hanson.","author":"Dongarra Jack J.","year":"1988","unstructured":"Jack J. Dongarra, Jeremy Du Croz, Sven Hammarling, and Richard J. Hanson. 1988. An extended set of FORTRAN basic linear algebra subprograms. ACM Trans. Math. Softw. (1988)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586216"},{"key":"e_1_3_2_1_18_1","volume-title":"FDL'01","author":"Guillou Anne-Claire","year":"2001","unstructured":"Anne-Claire Guillou, Fabien Quiller\u00e9, Patrice Quinton, S Rajopadhye, and Tanguy Risset. 2001. Hardware design methodology with the Alpha language. FDL'01 (2001)."},{"key":"e_1_3_2_1_19_1","volume-title":"Lasa: Abstraction and Specialization for Productive and Performant Linear Algebra on FPGAs. In IEEE Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM'23)","author":"Hao Xiaochen","year":"2023","unstructured":"Xiaochen Hao, Mingzhe Zhang, Ce Sun, Zhuofu Tao, Hongbo Rong, Yu Zhang, Lei He, Eric Petit, Wenguang Chen, and Yun Liang. 2023. Lasa: Abstraction and Specialization for Productive and Performant Linear Algebra on FPGAs. In IEEE Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM'23)."},{"key":"e_1_3_2_1_20_1","volume-title":"Bal","author":"Hijma Pieter","year":"2023","unstructured":"Pieter Hijma, Stijn Heldens, Alessio Sclocco, Ben van Werkhoven, and Henri E. Bal. 2023. Optimization Techniques for GPU Programming. ACM Comput. Surv. (2023)."},{"key":"e_1_3_2_1_21_1","unstructured":"Intel. 2015. The Computer Architecture of Intel Processor Graphics Gen9. https:\/\/software.intel.com\/content\/dam\/develop\/external\/us\/en\/documents\/the-compute-architecture-of-intel-processor-graphics-gen9-v1d0.pdf."},{"issue":"1","key":"e_1_3_2_1_22_1","first-page":"2","article-title":"Intel FPGA SDK for OpenCL Pro Edition","volume":"10","year":"2021","unstructured":"Intel. 2021. Intel FPGA SDK for OpenCL Pro Edition: Best Practices Guide - Section 10.1.2. https:\/\/www.intel.com\/content\/www\/us\/en\/docs\/programmable\/683521\/21--4\/using-a-single-kernel-to-describe-systolic.html.","journal-title":"Best Practices Guide - Section"},{"key":"e_1_3_2_1_23_1","unstructured":"Intel. 2023 a. Intel Arria 10 Product Table. https:\/\/www.intel.com\/content\/dam\/www\/programmable\/us\/en\/pdfs\/literature\/pt\/arria-10-product-table.pdf."},{"key":"e_1_3_2_1_24_1","unstructured":"Intel. 2023 b. Intel clDNN. https:\/\/github.com\/intel\/clDNN."},{"key":"e_1_3_2_1_25_1","unstructured":"Intel. 2023 c. Intel FPGA SDK for OpenCL Pro Edition. http:\/\/fpgasoftware.intel.com\/opencl\/."},{"key":"e_1_3_2_1_26_1","unstructured":"Intel. 2023 d. Intel Stratix 10 Product Table. https:\/\/www.intel.com\/content\/dam\/www\/programmable\/us\/en\/pdfs\/literature\/pt\/stratix-10-product-table.pdf."},{"key":"e_1_3_2_1_27_1","unstructured":"Intel. 2023 e. OneAPI Programming Model. https:\/\/www.oneapi.io\/."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586329"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530411"},{"key":"e_1_3_2_1_30_1","volume-title":"The organization of computations for uniform recurrence equations. J. ACM","author":"Karp Richard M","year":"1967","unstructured":"Richard M Karp, Raymond E Miller, and Shmuel Winograd. 1967. The organization of computations for uniform recurrence equations. J. ACM (1967)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3192366.3192379"},{"key":"e_1_3_2_1_32_1","volume-title":"Why systolic architecturesc IEEE computer","author":"Kung Hsiang-Tsung","year":"1982","unstructured":"Hsiang-Tsung Kung. 1982. Why systolic architecturesc IEEE computer (1982)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3289602.3293910"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3400302.3415644"},{"key":"e_1_3_2_1_35_1","unstructured":"Yi-Hsiang Lai Ecenur Ustun Shaojie Xiang Zhenman Fang Hongbo Rong and Zhiru Zhang. 2021. Programming and Synthesis for Software-Defined FPGA Acceleration: Status and Future Prospects. ACM Trans. Reconfigurable Technol. Syst. (2021)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00062"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370324"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247743"},{"key":"e_1_3_2_1_39_1","volume-title":"On the design of algorithms for VLSI systolic arrays. Proc","author":"Moldovan D. I.","year":"1983","unstructured":"D. I. Moldovan. 1983. On the design of algorithms for VLSI systolic arrays. Proc. IEEE (1983)."},{"key":"e_1_3_2_1_40_1","volume-title":"VTA: an open hardware-software stack for deep learning. arXiv preprint arXiv:1807.04188","author":"Moreau Thierry","year":"2018","unstructured":"Thierry Moreau, Tianqi Chen, Ziheng Jiang, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. VTA: an open hardware-software stack for deep learning. arXiv preprint arXiv:1807.04188 (2018)."},{"key":"e_1_3_2_1_41_1","unstructured":"Netlib. 2023. Band Storage. https:\/\/netlib.org\/lapack\/lug\/node124.html"},{"key":"e_1_3_2_1_42_1","volume-title":"Tensors for data mining and data fusion: Models, applications, and scalable algorithms. ACM Transactions on Intelligent Systems and Technology","author":"Papalexakis Evangelos E","year":"2016","unstructured":"Evangelos E Papalexakis, Christos Faloutsos, and Nicholas D Sidiropoulos. 2016. Tensors for data mining and data fusion: Models, applications, and scalable algorithms. ACM Transactions on Intelligent Systems and Technology (2016)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304059"},{"key":"e_1_3_2_1_44_1","volume-title":"The landscape of software for tensor computations. arXiv preprint arXiv:2103.13756","author":"Psarras Christos","year":"2021","unstructured":"Christos Psarras, Lars Karlsson, Jiajia Li, and Paolo Bientinesi. 2021. The landscape of software for tensor computations. arXiv preprint arXiv:2103.13756 (2021)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/800015.808184"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_47_1","unstructured":"Chris Rauer George S. Powley Mir Ahsan and Jr. Nicholas Finamore. 2017. Intel White Paper: Accelerating Genomics Research with OpenCL and FPGAs. https:\/\/www.intel.com\/content\/dam\/www\/programmable\/us\/en\/pdfs\/literature\/wp\/wp-accelerating-genomics-opencl-fpgas.pdf."},{"key":"e_1_3_2_1_48_1","volume-title":"Efficient acceleration of the pair-hmms forward algorithm for gatk haplotypecaller on graphics processing units. Evolutionary Bioinformatics","author":"Ren Shanshan","year":"2018","unstructured":"Shanshan Ren, Koen Bertels, and Zaid Al-Ars. 2018. Efficient acceleration of the pair-hmms forward algorithm for gatk haplotypecaller on graphics processing units. Evolutionary Bioinformatics (2018)."},{"key":"e_1_3_2_1_49_1","volume-title":"Survey and benchmarking of machine learning accelerators","author":"Reuther Albert","unstructured":"Albert Reuther, Peter Michaleas, Michael Jones, Vijay Gadepally, Siddharth Samsi, and Jeremy Kepner. 2019. Survey and benchmarking of machine learning accelerators. In IEEE high performance extreme computing conference (HPEC'19)."},{"key":"e_1_3_2_1_50_1","volume-title":"Programmatic control of a compiler for generating high-performance spatial hardware. arXiv preprint arXiv:1711.07606","author":"Rong Hongbo","year":"2017","unstructured":"Hongbo Rong. 2017. Programmatic control of a compiler for generating high-performance spatial hardware. arXiv preprint arXiv:1711.07606 (2017)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00033"},{"key":"e_1_3_2_1_52_1","unstructured":"Techpowerup. 2021a. Intel Iris Xe MAX Graphics. https:\/\/www.techpowerup.com\/gpu-specs\/iris-xe-max-graphics.c3737."},{"key":"e_1_3_2_1_53_1","unstructured":"Techpowerup. 2021b. Intel UHD Graphics P630. https:\/\/www.techpowerup.com\/gpu-specs\/uhd-graphics-p630.c3452."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Peter Vanbroekhoven Gerda Janssens Maurice Bruynooghe and Francky Catthoor. 2007. A Practical Dynamic Single Assignment Transformation. ACM Trans. Des. Autom. Electron. Syst. (2007).","DOI":"10.1145\/1278349.1278353"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3431920.3439292"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.79"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3061639.3062207"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00086"},{"key":"e_1_3_2_1_59_1","volume-title":"The Formal Synthesis of Control Signals for Systolic Arrays. Ph.,D. Dissertation. Department of Computer Science","author":"Xue Jingling","unstructured":"Jingling Xue. 1992. The Formal Synthesis of Control Signals for Systolic Arrays. Ph.,D. Dissertation. Department of Computer Science, University of Edinburgh."},{"key":"e_1_3_2_1_60_1","volume-title":"ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA'22)","author":"Zheng Size","year":"2022","unstructured":"Size Zheng, Renze Chen, Anjiang Wei, Yicheng Jin, Qin Han, Liqiang Lu, Bingyang Wu, Xiuhong Li, Shengen Yan, and Yun Liang. 2022. AMOS: enabling automatic mapping for tensor computations on spatial accelerators with hardware abstraction. In ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA'22)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623792"},{"key":"e_1_3_2_1_62_1","volume-title":"Chimera: An Analytical Optimizing Framework for Effective Compute-intensive Operators Fusion. In IEEE International Symposium on High-Performance Computer Architecture (HPCA'23)","author":"Zheng Size","year":"2023","unstructured":"Size Zheng, Siyuan Chen, Peidi Song, Renze Chen, Xiuhong Li, Shengen Yan, Dahua Lin, Jingwen Leng, and Yun Liang. 2023 b. Chimera: An Analytical Optimizing Framework for Effective Compute-intensive Operators Fusion. In IEEE International Symposium on High-Performance Computer Architecture (HPCA'23)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"}],"event":{"name":"FPGA '24: The 2024 ACM\/SIGDA International Symposium on Field Programmable Gate Arrays","location":"Monterey CA USA","acronym":"FPGA '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation"]},"container-title":["Proceedings of the 2024 ACM\/SIGDA International Symposium on Field Programmable Gate Arrays"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626202.3637566","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3626202.3637566","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T22:04:12Z","timestamp":1755900252000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626202.3637566"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4]]},"references-count":63,"alternative-id":["10.1145\/3626202.3637566","10.1145\/3626202"],"URL":"https:\/\/doi.org\/10.1145\/3626202.3637566","relation":{},"subject":[],"published":{"date-parts":[[2024,4]]},"assertion":[{"value":"2024-04-02","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}