{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,4]],"date-time":"2026-02-04T18:19:54Z","timestamp":1770229194547,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":75,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSF","award":["DGE 2236417"],"award-info":[{"award-number":["DGE 2236417"]}]},{"name":"DOE","award":["DE-SC0021"],"award-info":[{"award-number":["DE-SC0021"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3730423","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"776-791","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Taking GPU Programming Models to Task for Performance Portability"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6704-0520","authenticated-orcid":false,"given":"Joshua Hoke","family":"Davis","sequence":"first","affiliation":[{"name":"Department of Computer Science, University of Maryland, College Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7558-823X","authenticated-orcid":false,"given":"Pranav","family":"Sivaraman","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Maryland, College Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7692-2453","authenticated-orcid":false,"given":"Joy","family":"Kitson","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Maryland, College Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8258-9693","authenticated-orcid":false,"given":"Konstantinos","family":"Parasyris","sequence":"additional","affiliation":[{"name":"Lawrence Livermore National Laboratory, Livermore, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4707-9580","authenticated-orcid":false,"given":"Harshitha","family":"Menon","sequence":"additional","affiliation":[{"name":"Lawrence Livermore National Laboratory, Livermore, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9832-6037","authenticated-orcid":false,"given":"Isaac","family":"Minn","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Maryland, College Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6542-3555","authenticated-orcid":false,"given":"Giorgis","family":"Georgakoudis","sequence":"additional","affiliation":[{"name":"Lawrence Livermore National Laboratory, Livermore, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3069-3701","authenticated-orcid":false,"given":"Abhinav","family":"Bhatele","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Maryland, College Park, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"[n. d.]. ECP Proxy Applications. https:\/\/proxyapps.exascaleproject.org\/. Accessed: 2023-09-30."},{"key":"e_1_3_3_2_3_2","unstructured":"[n. d.]. NERSC Proxy Suite. https:\/\/www.nersc.gov\/research-and-development\/nersc-proxy-suite\/."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Victor Artigues Katharina Kormann Markus Rampp and Klaus Reuter. 2020. Evaluation of performance portability frameworks for the implementation of a particle-in-cell code. Concurrency and Computation: Practice and Experience 32 11 (2020) e5640.","DOI":"10.1002\/cpe.5640"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","unstructured":"D.\u00a0A. Beckingsale M.\u00a0J. McFadden J.\u00a0P.\u00a0S. Dahm R. Pankajakshan and R.\u00a0D. Hornung. 2020. Umpire: Application-focused management and coordination of complex hierarchical memory. IBM Journal of Research and Development 64 3\/4 (2020) 00:1\u201300:10. 10.1147\/JRD.2019.2954403","DOI":"10.1147\/JRD.2019.2954403"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356173"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Claude Bernard Michael\u00a0C Ogilvie Thomas\u00a0A DeGrand Carleton\u00a0E DeTar Steven\u00a0A Gottlieb A Krasnitz Robert\u00a0L Sugar and Doug Toussaint. 1991. Studying quarks and gluons on MIMD parallel computers. The International Journal of Supercomputing Applications 5 4 (1991) 61\u201370.","DOI":"10.1177\/109434209100500406"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-02465-9_51"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid54584.2022.00077"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC49587.2019.00010"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-74224-9_2"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC56579.2022.00006"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC54578.2021.00007"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC49587.2019.00006"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC51967.2020.00006"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Tom Deakin James Price Matt Martineau and Simon McIntosh-Smith. 2018. Evaluating Attainable Memory Bandwidth of Parallel Programming Models via BabelStream. Int. J. Comput. Sci. Eng. 17 3 (jan 2018) 247\u2013262.","DOI":"10.1504\/IJCSE.2018.095847"},{"key":"e_1_3_3_2_17_2","volume-title":"su3_bench: Lattice QCD SU (3) matrix-matrix multiply microbenchmark (su3_bench) v1. 0","author":"Doerfler Douglas","year":"2020","unstructured":"Douglas Doerfler and Christopher Daley. 2020. su3_bench: Lattice QCD SU (3) matrix-matrix multiply microbenchmark (su3_bench) v1. 0. Technical Report. Lawrence Berkeley National Lab.(LBNL), Berkeley, CA (United States)."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC54578.2021.00009"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807623"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-12274-4_4"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC.2018.00006"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.Companion.2012.66"},{"key":"e_1_3_3_2_23_2","volume-title":"Mantevo Suite 1.0.","author":"Heroux Michael\u00a0Allen","year":"2013","unstructured":"Michael\u00a0Allen Heroux, Richard\u00a0Frederick Barrett, James\u00a0Michael Willenbring, Simon\u00a0David Hammond, David Richards, Jamal Mohd-Yusof, and Andrew Herdman. 2013. Mantevo Suite 1.0.Technical Report. Sandia National Lab.(SNL-NM), Albuquerque, NM (United States)."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.2172\/1169830"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2013.115"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624133"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC54578.2021.00008"},{"key":"e_1_3_3_2_28_2","unstructured":"Wei-Chen Lin Simon McIntosh-Smith and Tom Deakin. 2024. Preliminary report: Initial evaluation of StdPar implementations on AMD GPUs for HPC. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.02680 (2024)."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/PDP52278.2021.00036"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Ami Marowka. 2023. A comparison of two performance portability metrics. Concurrency and Computation: Practice and Experience (2023) e7868.","DOI":"10.1002\/cpe.7868"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Matthew Martineau Simon McIntosh-Smith and Wayne Gaudin. 2017. Assessing the performance portability of modern parallel programming models using TeaLeaf. Concurrency and Computation: Practice and Experience 29 15 (2017) e4117.","DOI":"10.1002\/cpe.4117"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Simon McIntosh-Smith James Price Richard\u00a0B Sessions and Amaurys\u00a0A Ibarra. 2015. High performance in silico virtual drug screening on many-core processors. The international journal of high performance computing applications 29 2 (2015) 119\u2013134.","DOI":"10.1177\/1094342014528252"},{"key":"e_1_3_3_2_33_2","unstructured":"OpenMP4 2013. OpenMP Application Program Interface. Version 4.0. July 2013."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC54578.2021.00004"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"crossref","unstructured":"S\u00a0John Pennycook Jason\u00a0D Sewall Douglas\u00a0W Jacobsen Tom Deakin and Simon McIntosh-Smith. 2021. Navigating performance portability and productivity. Computing in Science & Engineering 23 5 (2021) 28\u201338.","DOI":"10.1109\/MCSE.2021.3097276"},{"key":"e_1_3_3_2_36_2","volume-title":"Proceedings of the 7th International Workshop in Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems","author":"Pennycook Simon\u00a0J","year":"2016","unstructured":"Simon\u00a0J Pennycook, Jason\u00a0D Sewall, and Victor\u00a0W Lee. 2016. A metric for performance portability. In Proceedings of the 7th International Workshop in Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems. https:\/\/arxiv.org\/abs\/1611.07409"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Simon\u00a0J Pennycook Jason\u00a0D Sewall and Victor\u00a0W Lee. 2019. Implications of a metric for performance portability. Future Generation Computer Systems 92 (2019) 947\u2013958.","DOI":"10.1016\/j.future.2017.08.007"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624187"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC49654.2021.9622813"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC49587.2019.00008"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Istv\u00e1n\u00a0Z Reguly and Gihan\u00a0R Mudalige. 2020. Productivity performance and portability for computational fluid dynamics applications. Computers & Fluids 199 (2020) 104425.","DOI":"10.1016\/j.compfluid.2020.104425"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Paul\u00a0K Romano Nicholas\u00a0E Horelik Bryan\u00a0R Herman Adam\u00a0G Nelson Benoit Forget and Kord Smith. 2015. OpenMC: A state-of-the-art Monte Carlo code for research and development. Annals of Nuclear Energy 82 (2015) 90\u201397.","DOI":"10.1016\/j.anucene.2014.07.048"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-17473-0_4"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC.2018.00004"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC51967.2020.00007"},{"key":"e_1_3_3_2_46_2","unstructured":"TOP500.org. 2024. June 2024 TOP500. https:\/\/www.top500.org\/lists\/top500\/2024\/06\/"},{"key":"e_1_3_3_2_47_2","unstructured":"TOP500.org. 2024. November 2024 TOP500. https:\/\/www.top500.org\/lists\/top500\/2024\/11\/"},{"key":"e_1_3_3_2_48_2","unstructured":"John\u00a0R Tramm Andrew\u00a0R Siegel Tanzima Islam and Martin Schulz. 2014. XSBench-the development and verification of a performance abstraction for Monte Carlo reactor analysis. The Role of Reactor Physics toward a Sustainable Future (PHYSOR) (2014)."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","unstructured":"Christian\u00a0R. Trott Damien Lebrun-Grandi\u00e9 Daniel Arndt Jan Ciesko Vinh Dang Nathan Ellingwood Rahulkumar Gayatri Evan Harvey Daisy\u00a0S. Hollman Dan Ibanez Nevin Liber Jonathan Madsen Jeff Miles David Poliakoff Amy Powell Sivasankaran Rajamanickam Mikael Simberg Dan Sunderland Bruno Turcksin and Jeremiah Wilke. 2022. Kokkos 3: Programming Model Extensions for the Exascale Era. IEEE Transactions on Parallel and Distributed Systems 33 4 (2022) 805\u2013817. 10.1109\/TPDS.2021.3097283","DOI":"10.1109\/TPDS.2021.3097283"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Michael Wolfe Seyong Lee Jungwon Kim Xiaonan Tian Rengan Xu Barbara Chapman and Sunita Chandrasekaran. 2018. The OpenACC data model: Preliminary study on its major challenges and implementations. Parallel Comput. 78 (2018) 15\u201327.","DOI":"10.1016\/j.parco.2018.07.003"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730423","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730423","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:59:31Z","timestamp":1755867571000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3730423"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":49,"alternative-id":["10.1145\/3721145.3730423","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3730423","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}