{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T11:58:49Z","timestamp":1768823929793,"version":"3.49.0"},"reference-count":45,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2022,4,1]],"date-time":"2022-04-01T00:00:00Z","timestamp":1648771200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,4,1]],"date-time":"2022-04-01T00:00:00Z","timestamp":1648771200000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,4,1]],"date-time":"2022-04-01T00:00:00Z","timestamp":1648771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,4,1]],"date-time":"2022-04-01T00:00:00Z","timestamp":1648771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Exascale Computing","award":["17-SC-20-SC"],"award-info":[{"award-number":["17-SC-20-SC"]}]},{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006168","name":"National Nuclear Security Administration","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006168","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006227","name":"Lawrence Livermore National Laboratory","doi-asserted-by":"publisher","award":["B639429"],"award-info":[{"award-number":["B639429"]}],"id":[{"id":"10.13039\/100006227","id-type":"DOI","asserted-by":"publisher"}]},{"name":"ExxonMobil Graduate Fellowship"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2022,4,1]]},"DOI":"10.1109\/tpds.2021.3094169","type":"journal-article","created":{"date-parts":[[2021,7,1]],"date-time":"2021-07-01T19:33:56Z","timestamp":1625168036000},"page":"854-865","source":"Crossref","is-referenced-by-count":8,"title":["An Automated Tool for Analysis and Tuning of GPU-Accelerated Code in HPC Applications"],"prefix":"10.1109","volume":"33","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7977-3182","authenticated-orcid":false,"given":"Keren","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Xiaozhu","family":"Meng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8372-401X","authenticated-orcid":false,"given":"Ryuichi","family":"Sai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3336-0726","authenticated-orcid":false,"given":"Dejan","family":"Grubisic","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9026-5453","authenticated-orcid":false,"given":"John","family":"Mellor-Crummey","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","article-title":"AutoScope: Automatic suggestions for code optimizations using PerfExpert","author":"sopeju","year":"2011","journal-title":"Proc Int Conf Parallel Distrib Process Techn Appl"},{"key":"ref38","doi-asserted-by":"crossref","first-page":"685","DOI":"10.1002\/cpe.1553","article-title":"HPCToolkit: Tools for performance analysis of optimized parallel programs","volume":"22","author":"adhianto","year":"2010","journal-title":"Concurrency Comput Pract Experience"},{"key":"ref33","article-title":"Minimod: A finite difference solver for seismic modeling","author":"meng","year":"2020"},{"key":"ref32","article-title":"PeleC","year":"0"},{"key":"ref31","article-title":"ExaTENSOR","author":"lyakh","year":"0"},{"key":"ref30","article-title":"Quicksilver","year":"0"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.41"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392761"},{"key":"ref35","article-title":"AMD ROCm ROCProfiler","year":"0"},{"key":"ref34","author":"reinders","year":"2005","journal-title":"VTune Performance Analyzer Essentials"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2015.14"},{"key":"ref40","article-title":"MAQAO: Modular assembler quality analyzer and optimizer for itanium 2","volume":"200","author":"djoudi","year":"2005","journal-title":"Proc 4th Workshop EPIC Architectures Compiler Technol"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/2872887.2750375"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"ref13","year":"2020","journal-title":"NVIDIA Compute Sanitizer DA-05679&#x2013;001_v11 2"},{"key":"ref14","first-page":"214","article-title":"CUDAAdvisor: LLVM-based runtime profiling for modern GPUs","author":"shen","year":"2018","journal-title":"Proc Int Symp Code Generation Optim"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.1997.645821"},{"key":"ref16","article-title":"Instruction-based sampling: A new performance analysis technique for AMD family 10h processors","author":"drongowski","year":"2007"},{"key":"ref17","year":"2018","journal-title":"POWER9 Performance Monitor Unit User&#x2019;s Guide version 1 2"},{"key":"ref18","article-title":"PC sampling","year":"2019"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ProTools49597.2019.00006"},{"key":"ref28","article-title":"Dissecting the NVIDIA Volta GPU architecture via microbenchmarking","author":"jia","year":"2018"},{"key":"ref4","article-title":"NVIDIA nsight compute","year":"0"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2002.10019"},{"key":"ref3","article-title":"NVIDIA nsight systems","year":"0"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1177\/1094342006064482"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392752"},{"key":"ref8","first-page":"25","article-title":"Allinea MAP: Adding energy and OpenMP profiling without increasing overhead","author":"january","year":"2015","journal-title":"Tools for High Performance Computing"},{"key":"ref7","first-page":"85","article-title":"Score-P: A unified performance measurement system for petascale applications","author":"mey","year":"2012","journal-title":"Proc Int Conf Competence High Perform Comput"},{"key":"ref2","year":"2020","journal-title":"Profiler User&#x2019;s Guide DU-05982&#x2013;001_v11 2"},{"key":"ref9","first-page":"1","article-title":"GVPROF: A value profiler for GPU-based clusters","author":"zhou","year":"2020","journal-title":"Proc Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00055"},{"key":"ref20","article-title":"Data-centric performance measurement and mapping for highly parallel programming models","author":"zhang","year":"2018"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3368826.3377911"},{"key":"ref22","first-page":"115","article-title":"GPA: A GPU performance advisor based on instruction sampling","author":"zhou","year":"2021","journal-title":"Proc IEEE\/ACM Int Symp Code Gener Optim"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2011.12.006"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126961"},{"key":"ref24","article-title":"CUDA binary utilities","year":"0"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2014.7116904"},{"key":"ref23","article-title":"System, method, and computer program product for implementing software-based scoreboarding","author":"ohannessian jr","year":"2017"},{"key":"ref44","article-title":"Dyninst","year":"0"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICSM.1997.624245"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1137\/120864672"},{"key":"ref25","year":"2020","journal-title":"CUPTI User&#x2019;s Guide DA-05679&#x2013;001_v11 2"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/71\/9575177\/9470950-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/9575177\/09470950.pdf?arnumber=9470950","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,5]],"date-time":"2023-11-05T14:52:24Z","timestamp":1699195944000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9470950\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,4,1]]},"references-count":45,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2021.3094169","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,4,1]]}}}