{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:03:30Z","timestamp":1776931410167,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":104,"publisher":"ACM","funder":[{"name":"CoCoUnit ERC Advanced Grant of the EU's Horizon 2020 program","award":["833057"],"award-info":[{"award-number":["833057"]}]},{"name":"Spanish State Research Agency (MCIU\/AEI\/ 10.13039\/501100011033) and FEDER\/UE","award":["PID2020-113172RB-I00"],"award-info":[{"award-number":["PID2020-113172RB-I00"]}]},{"name":"Catalan Agency for University and Research (AGAUR)","award":["2021SGR00383"],"award-info":[{"award-number":["2021SGR00383"]}]},{"name":"ICREA Academia program"},{"name":"Spanish State Research Agency (MCIU\/AEI\/ 10.13039\/501100011033) and FEDER\/UE under grant","award":["PID2024-155476OB-I00"],"award-info":[{"award-number":["PID2024-155476OB-I00"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756041","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:21:19Z","timestamp":1760721679000},"page":"369-384","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Dissecting and Modeling the Architecture of Modern GPU Cores"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0052-7710","authenticated-orcid":false,"given":"Rodrigo","family":"Huerta","sequence":"first","affiliation":[{"name":"Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2377-6939","authenticated-orcid":false,"given":"Mojtaba Abaie","family":"Shoushtary","sequence":"additional","affiliation":[{"name":"Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5325-9153","authenticated-orcid":false,"given":"Jos\u00e9-Lorenzo","family":"Cruz","sequence":"additional","affiliation":[{"name":"Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0009-0996","authenticated-orcid":false,"given":"Antonio","family":"Gonzalez","sequence":"additional","affiliation":[{"name":"Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.5555\/3236002"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00075"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC55821.2022.9926299"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480093"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Ayaz Akram and Lina Sawalha. 2019. A Survey of Computer Architecture Simulation Techniques and Tools. IEEE Access 7 (2019) 78120\u201378145.","DOI":"10.1109\/ACCESS.2019.2917698"},{"key":"e_1_3_3_2_7_2","volume-title":"AMD Graphics Core Next Architecture, Generation 3. Reference Guide.","year":"2016","unstructured":"AMD. 2016. AMD Graphics Core Next Architecture, Generation 3. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_8_2","volume-title":"\"AMD Instinct MI100\" Instruction Set Architecture. Reference Guide.","year":"2020","unstructured":"AMD. 2020. \"AMD Instinct MI100\" Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_9_2","volume-title":"\"RDNA 1.0\" Instruction Set Architecture. Reference Guide.","year":"2020","unstructured":"AMD. 2020. \"RDNA 1.0\" Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_10_2","volume-title":"\"RDNA 2\" Instruction Set Architecture. Reference Guide.","year":"2020","unstructured":"AMD. 2020. \"RDNA 2\" Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_11_2","volume-title":"Vega 7nm Instruction Set Architecture. Reference Guide.","year":"2020","unstructured":"AMD. 2020. Vega 7nm Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_12_2","volume-title":"Vega Instruction Set Architecture. Reference Guide.","year":"2020","unstructured":"AMD. 2020. Vega Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_13_2","volume-title":"\"AMD Instinct MI200\" Instruction Set Architecture. Reference Guide.","year":"2022","unstructured":"AMD. 2022. \"AMD Instinct MI200\" Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_14_2","volume-title":"\"RDNA 3\" Instruction Set Architecture. Reference Guide.","year":"2023","unstructured":"AMD. 2023. \"RDNA 3\" Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_15_2","volume-title":"\"AMD Instinct MI300\" Instruction Set Architecture. Reference Guide.","year":"2024","unstructured":"AMD. 2024. \"AMD Instinct MI300\" Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_16_2","volume-title":"\"RDNA 3.5\" Instruction Set Architecture. Reference Guide.","year":"2024","unstructured":"AMD. 2024. \"RDNA 3.5\" Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_17_2","volume-title":"\"RDNA 4\" Instruction Set Architecture. Reference Guide.","year":"2025","unstructured":"AMD. 2025. \"RDNA 4\" Instruction Set Architecture. Reference Guide. Technical Report. AMD."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/RTSS.2017.00017"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10070957"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Nathan Binkert Bradford Beckmann Gabriel Black Steven\u00a0K. Reinhardt Ali Saidi Arkaprava Basu Joel Hestness Derek\u00a0R. Hower Tushar Krishna Somayeh Sardashti Rathijit Sen Korey Sewell Muhammad Shoaib Nilay Vaish Mark\u00a0D. Hill and David\u00a0A. Wood. 2011. The gem5 simulator. SIGARCH Comput. Archit. News 39 2 (Aug. 2011) 1\u20137.","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"John Burgess. 2020. RTX on\u2014The NVIDIA Turing GPU. IEEE Micro 40 2 (2020) 36\u201344.","DOI":"10.1109\/MM.2020.2971677"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Alhadi Bustamam Kevin Burrage and Nicholas\u00a0A. Hamilton. 2012. Fast Parallel Markov Clustering in Bioinformatics Using Massively Parallel Computing on GPU with CUDA and ELLPACK-R Sparse Format. IEEE\/ACM Transactions on Computational Biology and Bioinformatics 9 3 (2012) 679\u2013692.","DOI":"10.1109\/TCBB.2011.68"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Jianli Cao Zhikui Chen Yuxin Wang He Guo and Pengcheng Wang. 2021. Instruction prefetch for improving GPGPU performance. IEICE Transactions on Fundamentals of Electronics Communications and Computer Sciences E104A (2021) 773\u2013785. Issue 5.","DOI":"10.1587\/transfun.2020EAP1105"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2013.6704684"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_3_2_29_2","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. (10 2014). arxiv:https:\/\/arXiv.org\/abs\/1410.0759"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette Olivier Giroux and Denis Foley. 2018. Volta: Performance and Programmability. IEEE Micro 38 2 (2018) 42\u201352.","DOI":"10.1109\/MM.2018.022071134"},{"key":"e_1_3_3_2_31_2","unstructured":"Cloudcores. 2022. CuAssembler: An unofficial cuda assembler for all generations of SASS. https:\/\/github.com\/cloudcores\/CuAssembler"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00084"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Massimiliano Fasi Nicholas\u00a0J. Higham Mantas Mikaitis and Srikara Pranesh. 2021. Numerical behavior of NVIDIA tensor cores. PeerJ Computer Science 7 (2 2021) 1\u201319.","DOI":"10.7717\/peerj-cs.330"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000093"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155675"},{"key":"e_1_3_3_2_36_2","first-page":"139","volume-title":"IEEE International Symposium on Performance Analysis of Systems and Software","author":"Gera Prasun","year":"2018","unstructured":"Prasun Gera, Hyojong Kim, Hyesoon Kim, Sunpyo Hong, Vinod George, and Chi-Keung Luk. 2018. Performance Characterisation and Simulation of Intel\u2019s Integrated GPU Architecture. In IEEE International Symposium on Performance Analysis of Systems and Software. IEEE Computer Society, Los Alamitos, CA, USA, 139\u2013148."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"e_1_3_3_2_38_2","unstructured":"Scott Gray. 2014. MaxAS: Assembler for NVIDIA Maxwell architecture. https:\/\/github.com\/NervanaSystems\/maxas"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00058"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3696443.3708943"},{"key":"e_1_3_3_2_41_2","unstructured":"Steven\u00a0J Heinrich and A\u00a0L Madison. 2019. Techniques for efficiently transferring data to a processor. 417 (2019). Issue 62."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Francisco\u00a0E. Hern\u00e1ndez P\u00e9rez Nurzhan Mukhadiyev Xiao Xu Aliou Sow Bok\u00a0Jik Lee Ramanan Sankaran and Hong\u00a0G. Im. 2018. Direct numerical simulations of reacting flows with detailed chemistry using many-core\/GPU acceleration. Computers & Fluids 173 (2018) 73\u201379.","DOI":"10.1016\/j.compfluid.2018.03.074"},{"key":"e_1_3_3_2_43_2","unstructured":"Rodrigo Huerta Mojtaba\u00a0Abaie Shoushtary Josep-Lloren\u00e7 Cruz and Antonio Gonz\u00e1lez. 2025. Modern GPU Simulator MICRO 2025. https:\/\/github.com\/upc-arco\/modern-gpu-simulator-micro-2025"},{"key":"e_1_3_3_2_44_2","unstructured":"Rodrigo Huerta Mojtaba\u00a0Abaie Shoushtary and Antonio Gonz\u00e1lez. 2024. Analyzing and Improving Hardware Modeling of Accel-Sim. arxiv:https:\/\/arXiv.org\/abs\/2401.10082\u00a0[cs.AR]"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSD.2015.56"},{"key":"e_1_3_3_2_46_2","volume-title":"4th gem5 Users\u2019 Workshop","author":"Jamieson Charles","year":"2022","unstructured":"Charles Jamieson, Anushka Chandrashekar, Ian McDougall, and Matthew\u00a0D. Sinclair. 2022. gem5 GPU Accuracy Profiler (GAP). In 4th gem5 Users\u2019 Workshop."},{"key":"e_1_3_3_2_47_2","volume-title":"Dissecting the NVIDIA Turing T4 GPU via Microbenchmarking","author":"Jia Zhe","year":"2019","unstructured":"Zhe Jia, Marco Maggioni, Jeffrey Smith, and Daniele\u00a0Paolo Scarpazza. 2019. Dissecting the NVIDIA Turing T4 GPU via Microbenchmarking. Technical Report."},{"key":"e_1_3_3_2_48_2","volume-title":"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking","author":"Jia Zhe","year":"2018","unstructured":"Zhe Jia, Marco Maggioni, Benjamin Staiger, and Daniele\u00a0Paolo Scarpazza. 2018. Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking. Technical Report. arXiv:https:\/\/arXiv.org\/abs\/1804.06826"},{"key":"e_1_3_3_2_49_2","volume-title":"NVIDIA GTC 2021","author":"Jia Zhe","year":"2021","unstructured":"Zhe Jia and Peter\u00a0Van Sandt. 2021. Dissecting the Ampere GPU Architecture through Microbenchmarking. In NVIDIA GTC 2021. NVIDIA."},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00070"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/325164.325162"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00021"},{"key":"e_1_3_3_2_53_2","unstructured":"Mahmoud Khairy Jain Akshay Tor Aamodt and Timothy\u00a0G. Rogers. 2018. Exploring Modern GPU Memory System Design Challenges through Accurate Modeling. (2018)."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Ahmad Lashgar Ebad Salehi and Amirali Baniasadi. 2016. A Case Study in Reverse Engineering GPGPUs: Outstanding Memory Handling Resources. SIGARCH Comput. Archit. News 43 4 (4 2016) 15\u201321.","DOI":"10.1145\/2927964.2927968"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"crossref","unstructured":"Erik Lindholm John Nickolls Stuart Oberman and John Montrym. 2008. NVIDIA Tesla: A Unified Graphics and Computing Architecture. IEEE Micro 28 2 (2008) 39\u201355.","DOI":"10.1109\/MM.2008.31"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"crossref","unstructured":"Weiguo Liu Bertil Schmidt Gerrit Voss and Wolfgang M\u00fcller-Wittig. 2008. Accelerating molecular dynamics simulations using Graphics Processing Units with CUDA. Computer Physics Communications 179 9 (2008) 634\u2013641.","DOI":"10.1016\/j.cpc.2008.05.008"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC62836.2024.10938416"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00064"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-10549-5_35"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-26362-5_3"},{"key":"e_1_3_3_2_63_2","unstructured":"Microsoft. 2023. How Microsoft\u2019s bet on Azure unlocked an AI revolution.https:\/\/news.microsoft.com\/source\/features\/ai\/how-microsofts-bet-on-azure-unlocked-an-ai-revolution\/."},{"key":"e_1_3_3_2_64_2","unstructured":"Michael Mishkin. 2016. Write-after-Read Hazard Prevention in GPGPUsim. (2016)."},{"key":"e_1_3_3_2_65_2","unstructured":"S. Narang and G. Diamos. 2016. DeepBench: Benchmarking Deep Learning operations on different hardware. https:\/\/github.com\/baidu-research\/DeepBench"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"crossref","unstructured":"Marco\u00a0S Nobile Paolo Cazzaniga Andrea Tangherloni and Daniela Besozzi. 2016. Graphics processing units in bioinformatics computational biology and systems biology. Briefings in Bioinformatics 18 5 (07 2016) 870\u2013885.","DOI":"10.1093\/bib\/bbw058"},{"key":"e_1_3_3_2_67_2","volume-title":"NVIDIA\u2019s Next Generation CUDA TM Compute Architecture: Fermi","year":"2009","unstructured":"NVIDIA. 2009. NVIDIA\u2019s Next Generation CUDA TM Compute Architecture: Fermi. Technical Report. NVIDIA."},{"key":"e_1_3_3_2_68_2","volume-title":"Technology Overview NVIDIA GeForce GTX 680","year":"2012","unstructured":"NVIDIA. 2012. Technology Overview NVIDIA GeForce GTX 680. Technical Report. NVIDIA."},{"key":"e_1_3_3_2_69_2","volume-title":"NVIDIA NVLink TM High-Speed Interconnect: Application Performance","year":"2014","unstructured":"NVIDIA. 2014. NVIDIA NVLink TM High-Speed Interconnect: Application Performance. Technical Report. Nvidia."},{"key":"e_1_3_3_2_70_2","unstructured":"NVIDIA. 2016. NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_3_2_71_2","volume-title":"NVIDIA Tesla V100 GPU architecture the world\u2019s most advanced data center GPU","year":"2017","unstructured":"NVIDIA. 2017. NVIDIA Tesla V100 GPU architecture the world\u2019s most advanced data center GPU. Technical Report. NVIDIA."},{"key":"e_1_3_3_2_72_2","volume-title":"NVIDIA TURING GPU architecture Graphics Reinvented NVIDIA Turing GPU Architecture","year":"2018","unstructured":"NVIDIA. 2018. NVIDIA TURING GPU architecture Graphics Reinvented NVIDIA Turing GPU Architecture. Technical Report. NVIDIA."},{"key":"e_1_3_3_2_73_2","unstructured":"NVIDIA. 2019. NVIDIA Developer Forums: Instruction cache and instruction fetch stalls. https:\/\/forums.developer.nvidia.com\/t\/instruction-cache-and-instruction-fetch-stalls\/76883"},{"key":"e_1_3_3_2_74_2","volume-title":"NVIDIA AMPERE GA102 GPU architecture Second-Generation RTX NVIDIA Ampere GA102 GPU Architecture","year":"2020","unstructured":"NVIDIA. 2020. NVIDIA AMPERE GA102 GPU architecture Second-Generation RTX NVIDIA Ampere GA102 GPU Architecture. Technical Report. NVIDIA."},{"key":"e_1_3_3_2_75_2","volume-title":"NVIDIA ADA GPU architecture","year":"2022","unstructured":"NVIDIA. 2022. NVIDIA ADA GPU architecture. Technical Report. NVIDIA."},{"key":"e_1_3_3_2_76_2","volume-title":"NVIDIA H100 Tensor Core GPU Architecture","year":"2022","unstructured":"NVIDIA. 2022. NVIDIA H100 Tensor Core GPU Architecture. Technical Report. NVIDIA."},{"key":"e_1_3_3_2_77_2","volume-title":"NVIDIA RTX Blackwell GPU Architecture","year":"2025","unstructured":"NVIDIA. 2025. NVIDIA RTX Blackwell GPU Architecture. Technical Report. NVIDIA."},{"key":"e_1_3_3_2_78_2","unstructured":"NVIDIA. n. d.. CUDA binary utilities documentation. https:\/\/docs.nvidia.com\/cuda\/cuda-binary-utilities\/"},{"key":"e_1_3_3_2_79_2","unstructured":"NVIDIA. n. d.. CUTLASS: CUDA Templates for Linear Algebra Subroutines. https:\/\/github.com\/NVIDIA\/cutlass"},{"key":"e_1_3_3_2_80_2","unstructured":"Lars Nyland John\u00a0R Nickolls Gentaro Hirota and Tanmoy Mandal. 2011. Systems and methods for coalescing memory accesses of parallel threads. US Patent 8 086 806."},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00016"},{"key":"e_1_3_3_2_82_2","volume-title":"5th gem5 Users\u2019 Workshop","author":"Ramadas Vishnu","year":"2023","unstructured":"Vishnu Ramadas, Daniel Kouchekinia, Ndubuisi Osuji, and Matthew\u00a0D. Sinclair. 2023. Closing the Gap: Improving the Accuracy of gem5\u2019s GPU Models. In 5th gem5 Users\u2019 Workshop."},{"key":"e_1_3_3_2_83_2","volume-title":"6th Young Architects\u2019 (YArch) Workshop","author":"Ramadas Vishnu","year":"2024","unstructured":"Vishnu Ramadas, Daniel Kouchekinia, and Matthew\u00a0D. Sinclair. 2024. Further Closing the GAP: Improving the Accuracy of gem5\u2019s GPU Models. In 6th Young Architects\u2019 (YArch) Workshop."},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/115952.115961"},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"publisher","DOI":"10.5555\/320080.320085"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.16"},{"key":"e_1_3_3_2_87_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173211"},{"key":"e_1_3_3_2_88_2","unstructured":"Mojtaba\u00a0Abaie Shoushtary Jordi\u00a0Tubella Murgadas and Antonio Gonzalez. 2024. Control Flow Management in Modern GPUs. arxiv:https:\/\/arXiv.org\/abs\/2407.02944\u00a0[cs.AR]"},{"key":"e_1_3_3_2_89_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-94-017-7267-9_26"},{"key":"e_1_3_3_2_90_2","unstructured":"J.A. Stratton C. Rodrigues I.J. Sung N. Obeid L.W. Chang N. Anssari G.D. Liu and W.W. Hwu. 2012. Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. Center for Reliable and High-Performance Computing (2012)."},{"key":"e_1_3_3_2_91_2","doi-asserted-by":"crossref","unstructured":"Wei Sun Ang Li Tong Geng Sander Stuijk and Henk Corporaal. 2023. Dissecting Tensor Cores via Microbenchmarks: Latency Throughput and Numeric Behaviors. IEEE Transactions on Parallel and Distributed Systems 34 1 (2023) 246\u2013261.","DOI":"10.1109\/TPDS.2022.3217824"},{"key":"e_1_3_3_2_92_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322230"},{"key":"e_1_3_3_2_93_2","first-page":"830","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Villa Oreste","year":"2014","unstructured":"Oreste Villa, Daniel\u00a0R. Johnson, Mike Oconnor, Evgeny Bolotin, David Nellans, Justin Luitjens, Nikolai Sakharnykh, Peng Wang, Paulius Micikevicius, Anthony Scudiero, Stephen\u00a0W. Keckler, and William\u00a0J. Dally. 2014. Scaling the Power Wall: A Path to Exascale. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 830\u2013841."},{"key":"e_1_3_3_2_94_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00077"},{"key":"e_1_3_3_2_95_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"e_1_3_3_2_96_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2014.6983039"},{"key":"e_1_3_3_2_97_2","doi-asserted-by":"crossref","unstructured":"Craig Warren Antonios Giannopoulos Alan Gray Iraklis Giannakis Alan Patterson Laura Wetter and Andre Hamrah. 2019. A CUDA-based GPU engine for gprMax: Open source FDTD electromagnetic simulation software. Computer Physics Communications 237 (2019) 208\u2013218.","DOI":"10.1016\/j.cpc.2018.11.007"},{"key":"e_1_3_3_2_98_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2010.5452013"},{"key":"e_1_3_3_2_99_2","doi-asserted-by":"crossref","unstructured":"Jian\u00a0Liu Xiaoxia\u00a0Li Zheng\u00a0Mo and Li Guo. 2015. Revealing chemical reactions of coal pyrolysis with GPU-enabled ReaxFF molecular dynamics and cheminformatics analysis. Molecular Simulation 41 1-3 (2015) 13\u201327.","DOI":"10.1080\/08927022.2014.913789"},{"key":"e_1_3_3_2_100_2","unstructured":"Da Yan. 2019. TuringAS: Assembler for NVIDIA Volta and Turing GPUs. https:\/\/github.com\/daadaada\/turingas"},{"key":"e_1_3_3_2_101_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00071"},{"key":"e_1_3_3_2_102_2","first-page":"1220","volume-title":"IEEE Symposium on Security and Privacy","author":"Yavarzadeh Hosein","year":"2023","unstructured":"Hosein Yavarzadeh, Mohammadkazem Taram, Shravan Narayan, Deian Stefan, and Dean Tullsen. 2023. Half&Half: Demystifying Intel\u2019s Directional Branch Predictors for Fast, Secure Partitioned Execution. In IEEE Symposium on Security and Privacy. 1220\u20131237."},{"key":"e_1_3_3_2_103_2","unstructured":"Xiuxia Zhang. 2017. KeplerAs: An Open Source Kepler GPU Assembler. https:\/\/github.com\/xiuxiazhang\/KeplerAs"},{"key":"e_1_3_3_2_104_2","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018755"},{"key":"e_1_3_3_2_105_2","doi-asserted-by":"publisher","DOI":"10.1145\/3576915.3616672"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756041","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:49:27Z","timestamp":1769464167000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756041"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":104,"alternative-id":["10.1145\/3725843.3756041","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756041","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}