{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T05:39:55Z","timestamp":1767850795268,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731068","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:46:17Z","timestamp":1750437977000},"page":"1509-1523","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["GCStack+GCScaler: Fast and Accurate GPU Performance Analyses Using Fine-Grained Stall Cycle Accounting and Interval Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5937-8550","authenticated-orcid":false,"given":"Hanna","family":"Cha","sequence":"first","affiliation":[{"name":"Yonsei University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5549-7265","authenticated-orcid":false,"given":"Sungchul","family":"Lee","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0463-7717","authenticated-orcid":false,"given":"Jounghoo","family":"Lee","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3418-5299","authenticated-orcid":false,"given":"Yeonan","family":"Ha","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5432-7813","authenticated-orcid":false,"given":"Joonsung","family":"Kim","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Suwon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1015-9969","authenticated-orcid":false,"given":"Youngsok","family":"Kim","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_1_2_2","first-page":"265","volume-title":"Proc. 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et\u00a0al. 2016. TensorFlow: a System for Large-Scale Machine Learning. In Proc. 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI). 265\u2013283."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2016.7482092"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476221"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Yehia Arafa Abdel-Hameed\u00a0A Badawy Gopinath Chennupati Nandakishore Santhi and Stephan Eidenbenz. 2019. PPT-GPU: Scalable GPU Performance Modeling. IEEE Computer Architecture Letters (CAL) 18 1 (2019) 55\u201358.","DOI":"10.1109\/LCA.2019.2904497"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2010.5452029"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480100"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Rajeev Balasubramonian Andrew\u00a0B Kahng Naveen Muralimanohar Ali Shafiee and Vaishnav Srinivas. 2017. CACTI 7: New Tools for Interconnect Exploration in Innovative Off-Chip Memories. ACM Transactions on Architecture and Code Optimization (TACO) 14 2 (2017) 1\u201325.","DOI":"10.1145\/3085572"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Hanna Cha Sungchul Lee Yeonan Ha Hanhwi Jang Joonsung Kim and Youngsok Kim. 2024. GCStack: A GPU Cycle Accounting Mechanism for Providing Accurate Insight into GPU Performance. IEEE Computer Architecture Letters (CAL) (2024).","DOI":"10.1109\/LCA.2024.3476909"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2013.6704684"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Cen Chen Kenli Li Aijia Ouyang Zhuo Tang and Keqin Li. 2017. GPU-accelerated Parallel Hierarchical Extreme Learning Machine on Flink for Big Data. IEEE Transactions on Systems Man and Cybernetics: Systems 47 10 (2017) 2740\u20132753.","DOI":"10.1109\/TSMC.2017.2690673"},{"key":"e_1_3_3_1_14_2","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et\u00a0al. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI). 578\u2013594."},{"key":"e_1_3_3_1_15_2","volume-title":"NVIDIA AMPERE GA102 GPU ARCHITECTURE","author":"Corporation NVIDIA","year":"2020","unstructured":"NVIDIA Corporation. 2020. NVIDIA AMPERE GA102 GPU ARCHITECTURE. [Online] Available from: https:\/\/www.nvidia.com\/content\/PDF\/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf."},{"key":"e_1_3_3_1_16_2","volume-title":"NVIDIA Nsight Compute CLI Documentation","author":"Corporation NVIDIA","year":"2020","unstructured":"NVIDIA Corporation. 2020. NVIDIA Nsight Compute CLI Documentation. [Online] Available from: https:\/\/docs.nvidia.com\/nsight-compute\/NsightComputeCli\/index.html."},{"key":"e_1_3_3_1_17_2","unstructured":"NVIDIA Corporation. 2024. CUDA Samples. Available: https:\/\/docs.nvidia.com\/cuda\/cuda-samples\/"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"William\u00a0J. Dally Stephen\u00a0W. Keckler and David\u00a0B. Kirk. 2021. Evolution of the Graphics Processing Unit (GPU). IEEE Micro (2021).","DOI":"10.1109\/MM.2021.3113475"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Sambit Das Phani Motamarri Vishal Subramanian David\u00a0M Rogers and Vikram Gavini. 2022. DFT-FE 1.0: A Massively Parallel Hybrid CPU-GPU Density Functional Theory Code Using Finite-element Discretization. Computer Physics Communications 280 (2022) 108473.","DOI":"10.1016\/j.cpc.2022.108473"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Stijn Eyerman and Lieven Eeckhout. 2009. Per-Thread Cycle Accounting in SMT Processors. ACM Sigplan Notices 44 3 (2009) 133\u2013144.","DOI":"10.1145\/1508284.1508260"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Stijn Eyerman Lieven Eeckhout Tejas Karkhanis and James\u00a0E Smith. 2006. A Performance Counter Architecture for Computing Accurate CPI Components. ACM SIGPLAN Notices 41 11 (2006) 175\u2013184.","DOI":"10.1145\/1168918.1168880"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Stijn Eyerman Wim Heirman Kristof\u00a0Du Bois and Ibrahim Hur. 2018. Multi-Stage CPI Stacks. IEEE Computer Architecture Letters (CAL) 17 (2018).","DOI":"10.1109\/LCA.2017.2761751"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Richard\u00a0M Fujimoto. 1990. Parallel Discrete Event Simulation. Communications of the ACM 33 (1990) 30\u201353.","DOI":"10.1145\/84537.84545"},{"key":"e_1_3_3_1_24_2","first-page":"139","volume-title":"Proc. IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)","author":"Gera Prasun","year":"2018","unstructured":"Prasun Gera, Hyojong Kim, Hyesoon Kim, Sunpyo Hong, Vinod George, and Chi-Keung Luk. 2018. Performance Characterisation and Simulation of Intel\u2019s Integrated GPU Architecture. In Proc. IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS). IEEE, 139\u2013148."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2017.7975298"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480058"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589058"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.59"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.53"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.2172\/1762830"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2012.6189201"},{"key":"e_1_3_3_1_34_2","first-page":"257","volume-title":"Proc. 22nd International Conference on Parallel Architectures and Compilation Techniques (PACT)","author":"Jia Wenhao","year":"2013","unstructured":"Wenhao Jia, Kelly\u00a0A Shaw, and Margaret Martonosi. 2013. Starchart: Hardware and Software Optimization Using Recursive Partitioning Regression Trees. In Proc. 22nd International Conference on Parallel Architectures and Compilation Techniques (PACT). IEEE, 257\u2013267."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Hai Jiang Yi Chen Zhi Qiao Tien-Hsiung Weng and Kuan-Ching Li. 2015. Scaling Up MapReduce-Based Big Data Processing on Multi-GPU Systems. Cluster Computing 18 (2015) 369\u2013383.","DOI":"10.1007\/s10586-014-0400-1"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"David\u00a0Gohara John E.\u00a0Stone and Guochun Shi. 2010. OpenCL: A Parallel Programming Standard for Heterogeneous Computing Systems. Computing in Science & Engineering 12 (2010).","DOI":"10.1109\/MCSE.2010.69"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2015.14"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"e_1_3_3_1_39_2","first-page":"93","volume-title":"Parallel Computing: On the Road to Exascale","author":"Krasnopolsky Boris","year":"2016","unstructured":"Boris Krasnopolsky and Alexey Medvedev. 2016. Acceleration of Large Scale OpenFOAM Simulations on Distributed Systems with Multicore CPUs and GPUs. In Parallel Computing: On the Road to Exascale. IOS Press, 93\u2013102."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527384"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2013.6557151"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Sangpil Lee and Won\u00a0Woo Ro. 2015. Parallel GPU Architecture Simulation Framework Exploiting Architectural-Level Parallelism with Timing Error Prediction. IEEE Transactions on Computers (TC) 65 4 (2015) 1253\u20131265.","DOI":"10.1109\/TC.2015.2444848"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628107"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00028"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623773"},{"key":"e_1_3_3_1_46_2","unstructured":"Peter Mattson Christine Cheng Cody Coleman Greg Diamos Paulius Micikevicius David Patterson Hanlin Tang Gu-Yeon Wei Peter Bailis Victor Bittorf et\u00a0al. 2019. Mlperf Training Benchmark. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1910.01500 (2019)."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2009.7478342"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC53511.2021.00026"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00030"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"John Nickolls and William\u00a0J. Dally. 2010. The GPU Computing Era. IEEE Micro (2010).","DOI":"10.1109\/MM.2010.41"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2014.6983052"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00045"},{"key":"e_1_3_3_1_53_2","first-page":"164","volume-title":"Proc. IEEE International Symposium on Quality Electronic Design (ISQED)","author":"Mesa-Martinez Jose\u00a0Renau Sangeetha\u00a0Sudhakrishnan, Francisco J.","year":"2011","unstructured":"Jose\u00a0Renau Sangeetha\u00a0Sudhakrishnan, Francisco J. Mesa-Martinez. 2011. A Design Time Simulator for Computer Architects. In Proc. IEEE International Symposium on Quality Electronic Design (ISQED). IEEE, 164\u2013173."},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00088"},{"key":"e_1_3_3_1_55_2","unstructured":"John\u00a0A Stratton Christopher Rodrigues I-Jui Sung Nady Obeid Li-Wen Chang Nasser Anssari Geng\u00a0Daniel Liu and Wen-mei\u00a0W Hwu. 2012. Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. Center for Reliable and High-Performance Computing 127 7.2 (2012)."},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322230"},{"key":"e_1_3_3_1_57_2","volume-title":"General-Purpose Graphics Processor Architectures","author":"Lun\u00a0Fung Timothy G.\u00a0Rogers Tor M.\u00a0Aamodt, Wilson Wai","year":"2018","unstructured":"Timothy G.\u00a0Rogers Tor M.\u00a0Aamodt, Wilson Wai Lun\u00a0Fung. 2018. General-Purpose Graphics Processor Architectures. Morgan & Claypool."},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00085"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"crossref","unstructured":"Lu Wang Magnus Jahre Almutaz Adileh Zhiying Wang and Lieven Eeckhout. 2019. Modeling Emerging Memory-Divergent GPU Applications. IEEE Computer Architecture Letters (CAL) 18 (2019).","DOI":"10.1109\/LCA.2019.2923618"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056063"},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00022"},{"key":"e_1_3_3_1_62_2","first-page":"380","volume-title":"Proc. 56th IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"Yifan\u00a0Sun Adwait\u00a0Jog Ying\u00a0Li,","year":"2023","unstructured":"Adwait\u00a0Jog Ying\u00a0Li, Yifan\u00a0Sun. 2023. Path Forward Beyond Simulators: Fast and Accurate GPU Execution Time Prediction for DNN Workloads. In Proc. 56th IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 380\u2013394."},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"crossref","unstructured":"Zhibin Yu Lieven Eeckhout Nilanjan Goswami Tao Li Lizy\u00a0K John and Hai Jin. 2015. GPGPU-MiniBench: Accelerating GPGPU Micro-Architecture Simulation. IEEE Transactions on Computers (TC) 64 (2015).","DOI":"10.1109\/TC.2015.2395427"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/2465529.2465540"}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"SIGARCH '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731068","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:03:18Z","timestamp":1750503798000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731068"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":63,"alternative-id":["10.1145\/3695053.3731068","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731068","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}