{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T03:38:40Z","timestamp":1774323520336,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,4,15]],"date-time":"2023-04-15T00:00:00Z","timestamp":1681516800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2125813, 2050007"],"award-info":[{"award-number":["2125813, 2050007"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,4,15]]},"DOI":"10.1145\/3578244.3583736","type":"proceedings-article","created":{"date-parts":[[2023,3,29]],"date-time":"2023-03-29T22:08:41Z","timestamp":1680127721000},"page":"43-53","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["DrGPU: A Top-Down Profiler for GPU Applications"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4792-6244","authenticated-orcid":false,"given":"Yueming","family":"Hao","sequence":"first","affiliation":[{"name":"North Carolina State University, Raleigh, NC, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1555-9125","authenticated-orcid":false,"given":"Nikhil","family":"Jain","sequence":"additional","affiliation":[{"name":"Nvidia Corporation, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4609-6403","authenticated-orcid":false,"given":"Rob","family":"Van der Wijngaart","sequence":"additional","affiliation":[{"name":"Nvidia Corporation, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1248-0721","authenticated-orcid":false,"given":"Nirmal","family":"Saxena","sequence":"additional","affiliation":[{"name":"Nvidia Corporation, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6123-006X","authenticated-orcid":false,"given":"Yuanbo","family":"Fan","sequence":"additional","affiliation":[{"name":"Tenstorrent Incorporated, San Francisco, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1487-963X","authenticated-orcid":false,"given":"Xu","family":"Liu","sequence":"additional","affiliation":[{"name":"North Carolina State University, Raleigh, NC, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,4,15]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Compiling CUDA with clang. https:\/\/llvm.org\/docs\/CompileCudaWithLLVM.html [Accessed","year":"2021","unstructured":"2021. Compiling CUDA with clang. https:\/\/llvm.org\/docs\/CompileCudaWithLLVM.html [Accessed May 29, 2021]."},{"key":"e_1_3_2_1_2_1","volume-title":"GTPin - A Dynamic Binary Instrumentation Framework. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/articles\/gtpin.html [Accessed","year":"2021","unstructured":"2021. GTPin - A Dynamic Binary Instrumentation Framework. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/articles\/gtpin.html [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_3_1","volume-title":"The LLVM Compiler Infrastructure. https:\/\/llvm.org\/ [Accessed","year":"2021","unstructured":"2021. The LLVM Compiler Infrastructure. https:\/\/llvm.org\/ [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_4_1","volume-title":"https:\/\/github.com\/AMReX-Combustion\/PeleC [Accessed","author":"C.","year":"2021","unstructured":"2021. PeleC. https:\/\/github.com\/AMReX-Combustion\/PeleC [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/github.com\/ROCm-Developer-Tools\/rocprofiler [Accessed","year":"2021","unstructured":"2021. ROC-profiler. https:\/\/github.com\/ROCm-Developer-Tools\/rocprofiler [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_6_1","volume-title":"SASSI Instrumentation Tool for NVIDIA GPUs. https:\/\/github.com\/NVlabs\/SASSI [Accessed","year":"2021","unstructured":"2021. SASSI Instrumentation Tool for NVIDIA GPUs. https:\/\/github.com\/NVlabs\/SASSI [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_7_1","volume-title":"https:\/\/www.top500.org [Accessed","author":"List Top","year":"2021","unstructured":"2021. Top 500 List. https:\/\/www.top500.org [Accessed August 25, 2021]."},{"key":"e_1_3_2_1_8_1","volume-title":"HIP Programming Guide. https:\/\/rocmdocs.amd.com\/en\/latest\/Programming_Guides\/HIP-GUIDE.html [Accessed","author":"AMD Corporation","year":"2021","unstructured":"AMD Corporation. 2021. HIP Programming Guide. https:\/\/rocmdocs.amd.com\/en\/latest\/Programming_Guides\/HIP-GUIDE.html [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_9_1","volume-title":"Henry Wong, and Tor M Aamodt.","author":"Bakhoda Ali","year":"2009","unstructured":"Ali Bakhoda, George L Yuan, Wilson WL Fung, Henry Wong, and Tor M Aamodt. 2009. Analyzing CUDA workloads using a detailed GPU simulator. In 2009 IEEE Int'l Symp. on Performance Analysis of Systems and Software. IEEE, 163--174."},{"key":"e_1_3_2_1_10_1","unstructured":"Alexey Bochkovskiy Chien-Yao Wang and Hong-Yuan Mark Liao. 2020. YOLOv4: Optimal Speed and Accuracy of Object Detection. arXiv:2004.10934 [cs.CV]"},{"key":"e_1_3_2_1_11_1","volume-title":"CUDA flux: A lightweight instruction profiler for CUDA applications. In 2019 IEEE\/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)","author":"Braun Lorenz","unstructured":"Lorenz Braun and Holger Fr\u00f6ning. 2019. CUDA flux: A lightweight instruction profiler for CUDA applications. In 2019 IEEE\/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS). IEEE, 73--81."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1088\/0004-637X\/715\/2\/1221"},{"key":"e_1_3_2_1_13_1","volume-title":"Sang- Ha Lee, and Kevin Skadron","author":"Che Shuai","year":"2009","unstructured":"Shuai Che, Michael Boyer, Jiayuan Meng, David Tarjan, Jeremy W Sheaffer, Sang- Ha Lee, and Kevin Skadron. 2009. Rodinia: A benchmark suite for heterogeneous computing. In 2009 IEEE Int'l Symp. on workload characterization (IISWC). Ieee, 44--54."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1854273.1854318"},{"key":"e_1_3_2_1_15_1","volume-title":"Data Parallel C. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/documentation\/oneapi-programming-guide\/top\/oneapi-programming-model\/data-parallel-c-dpc.html [Accessed","author":"Intel Inc. 2021.","year":"2021","unstructured":"Intel Inc. 2021. Data Parallel C. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/documentation\/oneapi-programming-guide\/top\/oneapi-programming-model\/data-parallel-c-dpc.html [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_16_1","volume-title":"Competence in High Performance Computing","author":"Mey Dieteran","year":"2010","unstructured":"Dieteran Mey, Scott Biersdorf, Christian Bischof, Kai Diethelm, Dominic Eschweiler, Michael Gerndt, Andreas Knapfer, Daniel Lorenz, Allen Malony, Wolfgang E. Nagel, Yury Oleynik, Christian Rassel, Pavel Saviankou, Dirk Schmidl, Sameer Shende, Michael Wagner, Bert Wesarg, and Felix Wolf. 2012. Score-P: A Unified Performance Measurement System for Petascale Applications. In Competence in High Performance Computing 2010, Christian Bischof, Heinz-Gerd Hegering, Wolfgang E. Nagel, and Gabriel Wittum (Eds.). Springer Berlin Heidelberg, 85--97."},{"key":"e_1_3_2_1_17_1","unstructured":"NVIDIA Corporation. [n.d.]. NVIDIA PC sampling view. http:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html#pc-sampling."},{"key":"e_1_3_2_1_18_1","volume-title":"NVIDIA Compute Sanitizer. https:\/\/docs.nvidia.com\/cuda\/compute-sanitizer\/index.html [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2020. NVIDIA Compute Sanitizer. https:\/\/docs.nvidia.com\/cuda\/compute-sanitizer\/index.html [Accessed March 26, 2021]."},{"key":"e_1_3_2_1_19_1","volume-title":"CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html [Accessed March 8, 2021]."},{"key":"e_1_3_2_1_20_1","volume-title":"NVCC. https:\/\/docs.nvidia.com\/cuda\/cuda-compiler-driver-nvcc\/index.html [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. CUDA compiler driver, NVCC. https:\/\/docs.nvidia.com\/cuda\/cuda-compiler-driver-nvcc\/index.html [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_21_1","volume-title":"https:\/\/developer.nvidia.com\/cuda-toolkit [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. CUDA Toolkit. https:\/\/developer.nvidia.com\/cuda-toolkit [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_22_1","volume-title":"NVIDIA cuBLAS. https:\/\/developer.nvidia.com\/cublas [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. NVIDIA cuBLAS. https:\/\/developer.nvidia.com\/cublas [Accessed March 26, 2021]."},{"key":"e_1_3_2_1_23_1","volume-title":"https:\/\/docs.nvidia.com\/cupti\/Cupti\/index.html [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. NVIDIA CUPTI. https:\/\/docs.nvidia.com\/cupti\/Cupti\/index.html [Accessed May 9, 2021]."},{"key":"e_1_3_2_1_24_1","volume-title":"NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-compute [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-compute [Accessed Aug 9, 2021]."},{"key":"e_1_3_2_1_25_1","volume-title":"NVIDIA Nsight Compute Kernel Profiling Guide. https:\/\/docs.nvidia.com\/nsight-compute\/ProfilingGuide\/index.html [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. NVIDIA Nsight Compute Kernel Profiling Guide. https:\/\/docs.nvidia.com\/nsight-compute\/ProfilingGuide\/index.html [Accessed March 8, 2021]."},{"key":"e_1_3_2_1_26_1","volume-title":"NVIDIA Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. NVIDIA Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems [Accessed March 9, 2021]."},{"key":"e_1_3_2_1_27_1","volume-title":"NVIDIA Tools Extension (NVTX). https:\/\/docs.nvidia.com\/gameworks\/content\/gameworkslibrary\/nvtx\/nvidia_tools_extension_library_nvtx.htm [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. NVIDIA Tools Extension (NVTX). https:\/\/docs.nvidia.com\/gameworks\/content\/gameworkslibrary\/nvtx\/nvidia_tools_extension_library_nvtx.htm [Accessed May 9, 2021]."},{"key":"e_1_3_2_1_28_1","volume-title":"The user manual for NVIDIA profiling tools for optimizing performance of CUDA applications. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide [Accessed","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. 2021. The user manual for NVIDIA profiling tools for optimizing performance of CUDA applications. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide [Accessed March 9, 2021]."},{"key":"e_1_3_2_1_29_1","unstructured":"Joseph Redmon. 2013--2016. Darknet: Open Source Neural Networks in C. http:\/\/pjreddie.com\/darknet\/."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3168831"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342006064482"},{"key":"e_1_3_2_1_32_1","volume-title":"https:\/\/www.khronos.org\/opencl\/ [Accessed","author":"The Khronos Group Inc. 2021. OPENCL.","year":"2021","unstructured":"The Khronos Group Inc. 2021. OPENCL. https:\/\/www.khronos.org\/opencl\/ [Accessed April 6, 2021]."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/223982.224449"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-32820-6_85"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2854038.2854041"},{"key":"e_1_3_2_1_37_1","volume-title":"Networking, Storage and Analysis (SC)","author":"Zhou Keren","unstructured":"Keren Zhou, Yueming Hao, John Mellor-Crummey, Xiaozhu Meng, and Xu Liu. 2020. GVProf: a value profiler for GPU-based clusters. In 2020 SC20: Int'l Conference for High Performance Computing, Networking, Storage and Analysis (SC). IEEE Computer Society, 1263--1278."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374534"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392752"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3094169"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370339"}],"event":{"name":"ICPE '23: ACM\/SPEC International Conference on Performance Engineering","location":"Coimbra Portugal","acronym":"ICPE '23","sponsor":["SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2023 ACM\/SPEC International Conference on Performance Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3578244.3583736","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3578244.3583736","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3578244.3583736","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:20Z","timestamp":1750182560000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3578244.3583736"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,15]]},"references-count":41,"alternative-id":["10.1145\/3578244.3583736","10.1145\/3578244"],"URL":"https:\/\/doi.org\/10.1145\/3578244.3583736","relation":{},"subject":[],"published":{"date-parts":[[2023,4,15]]},"assertion":[{"value":"2023-04-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}