{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T18:15:04Z","timestamp":1771956904984,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T00:00:00Z","timestamp":1676937600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Department of Energy","award":["DE-SC0019323"],"award-info":[{"award-number":["DE-SC0019323"]}]},{"name":"Advanced Scientific Computing Research Program","award":["DE-AC02-06CH11357"],"award-info":[{"award-number":["DE-AC02-06CH11357"]}]},{"DOI":"10.13039\/100008902","name":"Los Alamos National Laboratory","doi-asserted-by":"publisher","award":["531711"],"award-info":[{"award-number":["531711"]}],"id":[{"id":"10.13039\/100008902","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Exascale Computing Project","award":["17-SC-20-SC"],"award-info":[{"award-number":["17-SC-20-SC"]}]},{"name":"United States Air Force Artificial Intelligence Accelerator","award":["FA8750-19-2-1000"],"award-info":[{"award-number":["FA8750-19-2-1000"]}]},{"DOI":"10.13039\/501100001691","name":"Japan Society for the Promotion of Science","doi-asserted-by":"publisher","award":["19H04119"],"award-info":[{"award-number":["19H04119"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,25]]},"DOI":"10.1145\/3572848.3577475","type":"proceedings-article","created":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T16:02:30Z","timestamp":1676995350000},"page":"119-134","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":24,"title":["High-Performance GPU-to-CPU Transpilation and Optimization via High-Level Parallel Constructs"],"prefix":"10.1145","author":[{"given":"William S.","family":"Moses","sequence":"first","affiliation":[{"name":"MIT CSAIL, United States"}]},{"given":"Ivan R.","family":"Ivanov","sequence":"additional","affiliation":[{"name":"Tokyo Tech, Japan"}]},{"given":"Jens","family":"Domke","sequence":"additional","affiliation":[{"name":"RIKEN, Japan"}]},{"given":"Toshio","family":"Endo","sequence":"additional","affiliation":[{"name":"Tokyo Tech, Japan"}]},{"given":"Johannes","family":"Doerfert","sequence":"additional","affiliation":[{"name":"LLNL, United States"}]},{"given":"Oleksandr","family":"Zinenko","sequence":"additional","affiliation":[{"name":"Google, France"}]}],"member":"320","published-online":{"date-parts":[[2023,2,21]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/268946.268974"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3302577"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2014.07.003"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2015.44"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","unstructured":"Valentin Churavy Dilum Aluthge Lucas C Wilcox Simon Byrne Maciej Waruszewski Ali Ramadhan Meredith Simeon Schaub James Schloss Julian Samaroo Jake Bolewski Charles Kawczynski Jeremy E Kozdon Jinguo Liu Oliver Schulz Oscar P\u00e1ll Haraldsson Takafumi Arakaki and Tim Besard. 2022. JuliaGPU\/KernelAbstractions.jl: v0.8.0. 10.5281\/zenodo.6324344","DOI":"10.5281\/zenodo.6324344"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/75277.75280"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1065944.1065949"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1854273.1854318"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-28596-8_11"},{"key":"e_1_3_2_1_11_1","volume-title":"Evolving OpenMP for Evolving Architectures, Bronis R","author":"Doerfert Johannes","unstructured":"Johannes Doerfert and Hal Finkel. 2018. Compiler Optimizations for OpenMP. In Evolving OpenMP for Evolving Architectures, Bronis R. de Supinski, Pedro Valero-Lara, Xavier Martorell, Sergi Mateo Bellido, and Jesus Labarta (Eds.). Springer International Publishing, Cham, 113--127."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-34627-0_9"},{"key":"e_1_3_2_1_13_1","unstructured":"Aleksandr Drozd. 2021. Benchmarker. Online GitHub repository: https:\/\/github.com\/undertherain\/benchmarker\/ commit e1f22da320b0c7384cbd2f4df50255c7c2fa6b9d."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2011.10.002"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2014.07.003"},{"key":"e_1_3_2_1_16_1","volume-title":"Encyclopedia of parallel computing","author":"Feautrier Paul","year":"2011","unstructured":"Paul Feautrier and Christian Lengauer. 2011. Polyhedron Model. Encyclopedia of parallel computing (2011), 1581--1592."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2018.2873289"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/277650.277725"},{"key":"e_1_3_2_1_19_1","unstructured":"Fujitsu. 2021. https:\/\/www.fujitsu.com\/downloads\/SUPER\/a64fx\/a64fx_datasheet_en.pdf"},{"key":"e_1_3_2_1_20_1","unstructured":"Fujitsu. 2022. https:\/\/github.com\/fujitsu\/dnnl_aarch64"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3469030"},{"key":"e_1_3_2_1_22_1","volume-title":"International journal of parallel programming 26, 5","author":"Han Hwansoo","year":"1998","unstructured":"Hwansoo Han, Chau-Wen Tseng, and Pete Keleher. 1998. Eliminating barrier synchronization for compiler-parallelized codes on software DSMs. International journal of parallel programming 26, 5 (1998), 591--612."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3554736"},{"key":"e_1_3_2_1_24_1","unstructured":"Mark Harris et al. 2007. Optimizing parallel reduction in CUDA. Nvidia developer technology 2 4 (2007) 70."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACCPD.2014.10"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/1854273.1854303"},{"key":"e_1_3_2_1_28_1","unstructured":"Intel. 2022. OneAPI Deep Neural Network Library (OneDNN). https:\/\/github.com\/oneapi-src\/oneDNN"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10766-014-0320-y"},{"key":"e_1_3_2_1_30_1","volume-title":"Compiler Construction, Michael O'Boyle (Ed.)","author":"Karrenberg Ralf","unstructured":"Ralf Karrenberg and Sebastian Hack. 2012. Improving performance of OpenCL on CPUs. In Compiler Construction, Michael O'Boyle (Ed.). Springer Berlin Heidelberg, Berlin, Heidelberg, 1--20."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2627373.2627387"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178493"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/263699.263719"},{"key":"e_1_3_2_1_36_1","unstructured":"LLVM Contributors. 2021. OpenMP-aware optimizations. Online: https:\/\/openmp.llvm.org\/optimizations\/OpenMPOpt.html."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2892208.2892217"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/329366.301112"},{"key":"e_1_3_2_1_39_1","volume-title":"How should compilers represent fork-join parallelism? Master's thesis","author":"Moses William Steven","unstructured":"William Steven Moses. 2017. How should compilers represent fork-join parallelism? Master's thesis. Massachusetts Institute of Technology."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT52795.2021.00011"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476165"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2254064.2254124"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2002.1011394"},{"key":"e_1_3_2_1_44_1","volume-title":"Garnett (Eds.)","volume":"32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. Py-Torch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.), Vol. 32. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458744.3473356"},{"key":"e_1_3_2_1_46_1","volume-title":"ispc: A SPMD compiler for high-performance CPU programming. In 2012 Innovative Parallel Computing (InPar)","author":"Pharr Matt","unstructured":"Matt Pharr and William R Mark. 2012. ispc: A SPMD compiler for high-performance CPU programming. In 2012 Innovative Parallel Computing (InPar). IEEE, 1--13."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2017.00046"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00051"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3365655"},{"key":"e_1_3_2_1_51_1","volume-title":"PPIR: Parallel Pattern Intermediate Representation. In 2021 IEEE\/ACM International Workshop on Hierarchical Parallelism for Exascale Computing (HiPar). IEEE, 30--40","author":"Schmitz Adrian","year":"2021","unstructured":"Adrian Schmitz, Julian Miller, Lukas Tr\u00fcmper, and Matthias S M\u00fcller. 2021. PPIR: Parallel Pattern Intermediate Representation. In 2021 IEEE\/ACM International Workshop on Hierarchical Parallelism for Exascale Computing (HiPar). IEEE, 30--40."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. 10.48550\/ARXIV.1802.05799","DOI":"10.48550\/ARXIV.1802.05799"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/2983990.2984032"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485508"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3148173.3148186"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3148173.3148186"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/1772954.1772971"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-89740-8_2"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACSD.2006.23"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/1065944.1065947"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3148173.3148191"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/209937.209952"},{"key":"e_1_3_2_1_63_1","volume-title":"Joint scheduling and layout optimization to enable multi-level vectorization. IMPACT 12","author":"Vasilache Nicolas","year":"2012","unstructured":"Nicolas Vasilache, Benoit Meister, Muthu Baskaran, and Richard Lethin. 2012. Joint scheduling and layout optimization to enable multi-level vectorization. IMPACT 12 (2012)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3355606"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178372.3179507"}],"event":{"name":"PPoPP '23: The 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Montreal QC Canada","acronym":"PPoPP '23","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3572848.3577475","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3572848.3577475","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3572848.3577475","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:08:09Z","timestamp":1750183689000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3572848.3577475"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,21]]},"references-count":65,"alternative-id":["10.1145\/3572848.3577475","10.1145\/3572848"],"URL":"https:\/\/doi.org\/10.1145\/3572848.3577475","relation":{},"subject":[],"published":{"date-parts":[[2023,2,21]]},"assertion":[{"value":"2023-02-21","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}