{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T14:34:43Z","timestamp":1780497283814,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,11,13]],"date-time":"2021-11-13T00:00:00Z","timestamp":1636761600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Triad National Security,","award":["LLC contract# 581326"],"award-info":[{"award-number":["LLC contract# 581326"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,11,14]]},"DOI":"10.1145\/3458817.3476221","type":"proceedings-article","created":{"date-parts":[[2021,10,21]],"date-time":"2021-10-21T05:10:34Z","timestamp":1634793034000},"page":"1-15","source":"Crossref","is-referenced-by-count":27,"title":["Hybrid, scalable, trace-driven performance modeling of GPGPUs"],"prefix":"10.1145","author":[{"given":"Yehia","family":"Arafa","sequence":"first","affiliation":[{"name":"New Mexico State University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Abdel-Hameed","family":"Badawy","sequence":"additional","affiliation":[{"name":"New Mexico State University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ammar","family":"ElWazir","sequence":"additional","affiliation":[{"name":"New Mexico State University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Atanu","family":"Barai","sequence":"additional","affiliation":[{"name":"New Mexico State University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ali","family":"Eker","sequence":"additional","affiliation":[{"name":"Binghamton University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gopinath","family":"Chennupati","sequence":"additional","affiliation":[{"name":"Amazon Alexa"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nandakishore","family":"Santhi","sequence":"additional","affiliation":[{"name":"Los Alamos National Laboratory"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Stephan","family":"Eidenbenz","sequence":"additional","affiliation":[{"name":"Los Alamos National Laboratory"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,11,13]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392761"},{"key":"e_1_3_2_2_2_1","volume-title":"Low Overhead Instruction Latency Characterization for NVIDIA GPGPUs. In 2019 IEEE High Performance Extreme Computing Conference (HPEC). 1--8. 10","author":"Arafa Y.","year":"2019"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2019.2904497"},{"key":"e_1_3_2_2_4_1","volume-title":"IEEE 38th International Performance Computing and Communications Conference (IPCCC '19)","author":"Arafa Y.","year":"2019"},{"key":"e_1_3_2_2_5_1","volume-title":"Proceedings of the 17th ACM International Conference on Computing Frontiers (CF '20)","author":"Arafa Y."},{"key":"e_1_3_2_2_6_1","volume-title":"Proceedings of the 15th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","author":"Baghsorkhi S. S.","year":"2010"},{"key":"e_1_3_2_2_7_1","volume-title":"https:\/\/github.com\/baidu-research\/DeepBench. [Online","author":"Research Baidu","year":"2021"},{"key":"e_1_3_2_2_8_1","volume-title":"2009 IEEE International Symposium on Performance Analysis of Systems and Software. 163--174","author":"Bakhoda A.","year":"2009"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"A. Barai Y. Arafa A. Badawy G. Chennupati N. Santhi and S. Eidenbenz. 2021. PPT-Multicore: Performance Prediction of OpenMP applications using Reuse Profiles and Analytical Modeling. arXiv:2104.05102 [cs.PF]  A. Barai Y. Arafa A. Badawy G. Chennupati N. Santhi and S. Eidenbenz. 2021. PPT-Multicore: Performance Prediction of OpenMP applications using Reuse Profiles and Analytical Modeling. arXiv:2104.05102 [cs.PF]","DOI":"10.1007\/s11227-021-03949-4"},{"key":"e_1_3_2_2_10_1","volume-title":"The International Symposium on Memory Systems (MEMSYS). 341--351","author":"Barai A."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3431731"},{"key":"e_1_3_2_2_12_1","unstructured":"M. Brehob and R. Enbody. 1999. An analytical model of locality and caching. Technical Report Michigan State University MSU-CSE-99-31 (Aug. 1999).  M. Brehob and R. Enbody. 1999. An analytical model of locality and caching. Technical Report Michigan State University MSU-CSE-99-31 (Aug. 1999)."},{"key":"e_1_3_2_2_13_1","volume-title":"Proceedings of the 17th Annual International Conference on Supercomputing","author":"Cascaval C."},{"key":"e_1_3_2_2_14_1","volume-title":"IEEE International Symposium on Workload Characterization (IISWC '09)","author":"Che S.","year":"2009"},{"key":"e_1_3_2_2_15_1","volume-title":"Winter Simulation Conference (WSC '17)","author":"Chennupati G.","year":"2017"},{"key":"e_1_3_2_2_16_1","volume-title":"CUDA rogramming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_17_1","volume-title":"https:\/\/docs.nvidia.com\/cuda\/cuda-binary-utilities\/#cuobjdump. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_18_1","volume-title":"GPU Compute Capability. https:\/\/developer.nvidia.com\/cuda-gpus. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_19_1","volume-title":"Parallel Thread Execution ISA Version 7.2. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_20_1","volume-title":"Source and Assembly ISA. https:\/\/docs.nvidia.com\/cuda\/cuda-binary-utilities\/index.html#instruction-set-ref. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_21_1","volume-title":"19th International Conference on Parallel Architectures and Compilation Techniques (PACT). 353--364","author":"Diamos G."},{"key":"e_1_3_2_2_22_1","volume-title":"Proceedings of the ACM SIGPLAN 2003 Conference on Programming Language Design and Implementation (PLDI '03)","author":"Ding C."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"N. Ding and S. Williams. 2019. An Instruction Roofline Model for GPUs. In 2019 IEEE\/ACM Performance Modeling Benchmarking and Simulation of High Performance Computer Systems (PMBS). 7--18. 10.1109\/PMBS49563.2019.00007  N. Ding and S. Williams. 2019. An Instruction Roofline Model for GPUs. In 2019 IEEE\/ACM Performance Modeling Benchmarking and Simulation of High Performance Computer Systems (PMBS). 7--18. 10.1109\/PMBS49563.2019.00007","DOI":"10.1109\/PMBS49563.2019.00007"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2017.04.002"},{"key":"e_1_3_2_2_25_1","volume-title":"Collecting Roofline on GPUs. https:\/\/performanceportability.org\/perfport\/measurements\/gpu\/. [Online","author":"DOE SC","year":"2021"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.2172\/1079561"},{"key":"e_1_3_2_2_27_1","volume-title":"Innovative Parallel Computing (InPar '12)","author":"Grauer-Gray S.","year":"2012"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2951218"},{"key":"e_1_3_2_2_29_1","volume-title":"Proceedings of the 36th Annual International Symposium on Computer Architecture","author":"Hong S."},{"key":"e_1_3_2_2_30_1","volume-title":"GPUMech: GPU Performance Modeling Technique Based on Interval Analysis. In 2014 47th Annual IEEE\/ACM International Symposium on Microarchitecture. 268--279","author":"Huang J.","year":"2014"},{"key":"e_1_3_2_2_31_1","volume-title":"2015 IEEE International Symposium on Workload Characterization. 76--86","author":"Kambadur M.","year":"2015"},{"key":"e_1_3_2_2_32_1","volume-title":"Proceedings of the 12th Workshop on General Purpose Processing Using GPUs (GPGPU '19)","author":"Karki A.","year":"1941"},{"key":"e_1_3_2_2_33_1","volume-title":"Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). 473--486","author":"Khairy M.","year":"2020"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3291051"},{"key":"e_1_3_2_2_35_1","volume-title":"Livermore Unstructured Lagrangian Explicit Shock Hydrodynamics. https:\/\/github.com\/LLNL\/LULESH\/tree\/2.0.2-dev\/cuda\/src. [Online","author":"LLNL.","year":"2021"},{"key":"e_1_3_2_2_36_1","unstructured":"B. Lorenz and H. Fr\u00f6ning. 2019. CUDA Flux: A Lightweight Instruction Profiler for CUDA Applications. 2019 IEEE\/ACM Performance Modeling Benchmarking and Simulation of High Performance Computer Systems (PMBS) (2019) 73--81. 10.1109\/PMBS49563.2019.00014  B. Lorenz and H. Fr\u00f6ning. 2019. CUDA Flux: A Lightweight Instruction Profiler for CUDA Applications. 2019 IEEE\/ACM Performance Modeling Benchmarking and Simulation of High Performance Computer Systems (PMBS) (2019) 73--81. 10.1109\/PMBS49563.2019.00014"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1147\/sj.92.0078"},{"key":"e_1_3_2_2_38_1","volume-title":"Proceedings of the 2012 IEEE 26th International Parallel and Distributed Processing Symposium (IPDPS '12)","author":"Niu Q.","year":"2012"},{"key":"e_1_3_2_2_39_1","volume-title":"2014 IEEE 20th International Symposium on High Performance Computer Architecture (HPCA). 37--48","author":"Nugteren C.","year":"2014"},{"key":"e_1_3_2_2_40_1","volume-title":"CUDA Basic Linear Algebra Subroutines. https:\/\/developer.nvidia.com\/cublas. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_41_1","volume-title":"CUDA Deep Neural Network library. https:\/\/developer.nvidia.com\/cudnn. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_42_1","volume-title":"Nsight Compute CLI (nv-nsight). https:\/\/docs.nvidia.com\/nsight-compute\/NsightComputeCli\/index.html. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_43_1","volume-title":"https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/. [Online","author":"NVIDIA Corporation","year":"2021"},{"key":"e_1_3_2_2_44_1","volume-title":"Proceedings of the 2018 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation. 49--59","author":"Obaida M. A.","year":"2009"},{"key":"e_1_3_2_2_45_1","volume-title":"Cache-Conscious Wave-front Scheduling. In 2012 45th Annual IEEE\/ACM International Symposium on Microarchitecture. 72--83","author":"Rogers T. G.","year":"2012"},{"key":"e_1_3_2_2_46_1","volume-title":"2015 Winter Simulation Conference (WSC). 3013--3024","author":"Santhi N.","year":"2015"},{"key":"e_1_3_2_2_47_1","volume-title":"2017 IEEE International Symposium on Workload Characterization (IISWC). 54--65","author":"Sembrant A.","year":"2017"},{"key":"e_1_3_2_2_48_1","volume-title":"Proceedings of the 2018 International Symposium on Code Generation and Optimization (CGO). 214--227","author":"Shen D."},{"key":"e_1_3_2_2_49_1","volume-title":"SC '12: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis. 1--11","author":"Spafford K. L.","year":"2012"},{"key":"e_1_3_2_2_50_1","volume-title":"Flexible Software Profiling of GPU Architectures. In ACM\/IEEE 42nd Annual International Symposium on Computer Architecture (ISCA '15)","author":"Stephenson M."},{"key":"e_1_3_2_2_51_1","volume-title":"Joblib: running Python functions as pipeline jobs. https:\/\/joblib.readthedocs.io. [Online","author":"Team Joblib Development","year":"2021"},{"key":"e_1_3_2_2_52_1","volume-title":"The Top500 List. https:\/\/www.top500.org\/lists\/top500\/2020\/11\/. [Online","year":"2021"},{"key":"e_1_3_2_2_53_1","volume-title":"Proceedings of the 52nd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO '52)","author":"Villa O."},{"key":"e_1_3_2_2_54_1","volume-title":"MDM: The GPU Memory Divergence Model. In 2020 53rd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 1009--1021","author":"Wang L.","year":"2020"},{"key":"e_1_3_2_2_55_1","volume-title":"2019 IEEE International Symposium on High Performance Computer Architecture (HPCA). 506--518","author":"Wang X.","year":"2019"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_2_57_1","volume-title":"2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA). 564--576","author":"Wu G.","year":"2015"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/2427631.2427632"},{"key":"e_1_3_2_2_59_1","volume-title":"An Empirical Roofline Methodology for Quantitatively Assessing Performance Portability. In 2018 IEEE\/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC). 14--23","author":"Yang C.","year":"2018"},{"key":"e_1_3_2_2_60_1","volume-title":"SC '16: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 750--760","author":"Yang X.","year":"2016"},{"key":"e_1_3_2_2_61_1","unstructured":"G. Yongbin W. Wenxuan L. Yunfan and C. Lizhong. 2020. UVMBench: A Comprehensive Benchmark Suite for Researching Unified Virtual Memory in GPUs. arXiv:2007.09822 [cs.AR]  G. Yongbin W. Wenxuan L. Yunfan and C. Lizhong. 2020. UVMBench: A Comprehensive Benchmark Suite for Researching Unified Virtual Memory in GPUs. arXiv:2007.09822 [cs.AR]"},{"key":"e_1_3_2_2_62_1","volume-title":"2011 IEEE 17th International Symposium on High Performance Computer Architecture. 382--393","author":"Zhang Y.","year":"2011"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/1552309.1552310"}],"event":{"name":"SC '21: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis Missouri","acronym":"SC '21","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476221","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3476221","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:12:22Z","timestamp":1750191142000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476221"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,13]]},"references-count":63,"alternative-id":["10.1145\/3458817.3476221","10.1145\/3458817"],"URL":"https:\/\/doi.org\/10.1145\/3458817.3476221","relation":{},"subject":[],"published":{"date-parts":[[2021,11,13]]}}}