{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:30:26Z","timestamp":1773318626271,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":122,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759768","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"905-934","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Characterizing Performance, Power, and Energy of AMD CDNA3 GPU Family"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8485-581X","authenticated-orcid":false,"given":"Bagus","family":"Hanindhito","sequence":"first","affiliation":[{"name":"Dell Technologies, Round Rock, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8166-5686","authenticated-orcid":false,"given":"Bhavesh","family":"Patel","sequence":"additional","affiliation":[{"name":"Dell Technologies, Round Rock, Texas, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079079.3079103"},{"key":"e_1_3_3_3_3_2","doi-asserted-by":"publisher","unstructured":"Mark\u00a0James Abraham Teemu Murtola Roland Schulz Szil\u00e1rd P\u00e1ll Jeremy\u00a0C. Smith Berk Hess and Erik Lindahl. 2015. GROMACS: High performance molecular simulations through multi-level parallelism from laptops to supercomputers. SoftwareX 1-2 (2015) 19\u201325. 10.1016\/j.softx.2015.06.001","DOI":"10.1016\/j.softx.2015.06.001"},{"key":"e_1_3_3_3_4_2","unstructured":"Advanced Micro Devices Inc.2024. AMD Accelerates Pace of Data Center AI Innovation and Leadership with Expanded AMD Instinct GPU Roadmap. https:\/\/ir.amd.com\/news-events\/press-releases\/detail\/1201\/amd-accelerates-pace-of-data-center-ai-innovation-and-leadership-with-expanded-amd-instinct-gpu-roadmap"},{"key":"e_1_3_3_3_5_2","volume-title":"AMD Instinct MI300 Instruction Set Architecture","author":"Inc. Advanced Micro Devices,","year":"2024","unstructured":"Advanced Micro Devices, Inc.2024. AMD Instinct MI300 Instruction Set Architecture. Whitepaper. Advanced Micro Devices, Inc., California, US. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/instruction-set-architectures\/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf"},{"key":"e_1_3_3_3_6_2","volume-title":"AMD CDNA\u2122\u00a03 Architecture: The All-New AMD GPU Architecture for the Modern Era of HPC and AI","author":"Inc. Advanced Micro Devices,","year":"2025","unstructured":"Advanced Micro Devices, Inc.2025. AMD CDNA\u2122\u00a03 Architecture: The All-New AMD GPU Architecture for the Modern Era of HPC and AI. Whitepaper. Advanced Micro Devices, Inc., California, US. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/white-papers\/amd-cdna-3-white-paper.pdf"},{"key":"e_1_3_3_3_7_2","unstructured":"Advanced Micro Devices Inc.2025. AMD Matrix Instruction Calculator. https:\/\/github.com\/ROCm\/amd_matrix_instruction_calculator"},{"key":"e_1_3_3_3_8_2","unstructured":"Advanced Micro Devices Inc.2025. Low precision floating point types. https:\/\/rocm.docs.amd.com\/projects\/HIP\/en\/latest\/reference\/low_fp_types.html"},{"key":"e_1_3_3_3_9_2","unstructured":"Advanced Micro Devices Inc.2025. TransferBench. https:\/\/github.com\/ROCm\/TransferBench\/tree\/v1.61.00?tab=readme-ov-file"},{"key":"e_1_3_3_3_10_2","unstructured":"Amey Agrawal Ashish Panwar Jayashree Mohan Nipun Kwatra Bhargav\u00a0S. Gulavani and Ramachandran Ramjee. 2023. SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arxiv:https:\/\/arXiv.org\/abs\/2308.16369\u00a0[cs.LG]"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3725789.3725797"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624177"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","unstructured":"Yulong Ao Chao Yang Fangfang Liu Wanwang Yin Lijuan Jiang and Qiao Sun. 2018. Performance Optimization of the HPCG Benchmark on the Sunway TaihuLight Supercomputer. ACM Trans. Archit. Code Optim. 15 1 Article 11 (March 2018) 20\u00a0pages. 10.1145\/3182177","DOI":"10.1145\/3182177"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-50371-0_19"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/IRPS48228.2024.10529424"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","unstructured":"Ajay Bandi Pydi Venkata Satya\u00a0Ramesh Adapa and Yudu Eswar Vinay Pratap\u00a0Kumar Kuchi. 2023. The Power of Generative AI: A Review of Requirements Models Input-Output Formats Evaluation Metrics and Challenges. Future Internet 15 8 (2023) 60\u00a0pages. 10.3390\/fi15080260","DOI":"10.3390\/fi15080260"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00183"},{"key":"e_1_3_3_3_18_2","series-title":"Proceedings of Machine Learning Research","first-page":"992","volume-title":"Proceedings of the 38th International Conference on Machine Learning","volume":"139","author":"Blalock Davis","year":"2021","unstructured":"Davis Blalock and John Guttag. 2021. Multiplying Matrices Without Multiplying. In Proceedings of the 38th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, Virtual, 992\u20131004. https:\/\/proceedings.mlr.press\/v139\/blalock21a.html"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/PDP.2013.56"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607066"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","unstructured":"Yupeng Chang Xu Wang Jindong Wang Yuan Wu Linyi Yang Kaijie Zhu Hao Chen Xiaoyuan Yi Cunxiang Wang Yidong Wang Wei Ye Yue Zhang Yi Chang Philip\u00a0S. Yu Qiang Yang and Xing Xie. 2024. A Survey on Evaluation of Large Language Models. ACM Trans. Intell. Syst. Technol. 15 3 Article 39 (March 2024) 45\u00a0pages. 10.1145\/3641289","DOI":"10.1145\/3641289"},{"key":"e_1_3_3_3_22_2","unstructured":"Matthew Connatser. 2024. AMD crafts custom EPYC CPU with HBM3 memory for Microsoft Azure \u2013 CPU with 96 Zen 4 cores and 450GB of HBM3 may be repurposed MI300C four chips hit 7 TB\/s [Updated]. https:\/\/www.tomshardware.com\/pc-components\/cpus\/amd-crafts-custom-epyc-cpu-for-microsoft-azure-with-hbm3-memory-cpu-with-88-zen-4-cores-and-450gb-of-hbm3-may-be-repurposed-mi300c-four-chips-hit-7-tb-s"},{"key":"e_1_3_3_3_23_2","unstructured":"Fernando\u00a0Aznar Cornejo. 2024. Announcing Azure HBv5 Virtual Machines: A Breakthrough in Memory Bandwidth for HPC. https:\/\/www.hpcwire.com\/solution_content\/microsoft-amd\/announcing-azure-hbv5-virtual-machines-a-breakthrough-in-memory-bandwidth-for-hpc\/"},{"key":"e_1_3_3_3_24_2","unstructured":"Curtis Nicholas and Fanfarillo Alessandro. 2022. Intro to register pressure in AMD compilers. https:\/\/www.olcf.ornl.gov\/wp-content\/uploads\/Intro_Register_pressure_ORNL_20220812_2083.pdf"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","unstructured":"William\u00a0J. Dally Stephen\u00a0W. Keckler and David\u00a0B. Kirk. 2021. Evolution of the Graphics Processing Unit (GPU). IEEE Micro 41 6 (2021) 42\u201351. 10.1109\/MM.2021.3113475","DOI":"10.1109\/MM.2021.3113475"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","unstructured":"Tom Darden Darrin York and Lee Pedersen. 1993. Particle mesh Ewald: An Nlog(N) method for Ewald sums in large systems. The Journal of Chemical Physics 98 12 (06 1993) 10089\u201310092. 10.1063\/1.464397","DOI":"10.1063\/1.464397"},{"key":"e_1_3_3_3_27_2","unstructured":"DeepSeek-AI. 2025. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arxiv:https:\/\/arXiv.org\/abs\/2501.12948\u00a0[cs.CL]"},{"key":"e_1_3_3_3_28_2","unstructured":"DeepSeek-AI. 2025. DeepSeek-V3 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2412.19437\u00a0[cs.CL]"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","unstructured":"Yuefan Deng Peng Zhang Carlos Marques Reid Powell and Li Zhang. 2013. Analysis of Linpack and power efficiencies of the world\u2019s TOP500 supercomputers. Parallel Comput. 39 6 (2013) 271\u2013279. 10.1016\/j.parco.2013.04.007","DOI":"10.1016\/j.parco.2013.04.007"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/IGSC64514.2024.00016"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","unstructured":"Jack Dongarra Michael\u00a0A Heroux and Piotr Luszczek. 2016. High-performance conjugate-gradient benchmark: A new metric for ranking high-performance computing systems. The International Journal of High Performance Computing Applications 30 1 (2016) 3\u201310. 10.1177\/1094342015593158","DOI":"10.1177\/1094342015593158"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-18991-2_27"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","unstructured":"Jack\u00a0J. Dongarra. 1992. Performance of various computers using standard linear equations software. SIGARCH Comput. Archit. News 20 3 (June 1992) 22\u201344. 10.1145\/141868.141871","DOI":"10.1145\/141868.141871"},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","unstructured":"Jack\u00a0J. Dongarra Piotr Luszczek and Antoine Petitet. 2003. The LINPACK Benchmark: past present and future. Concurrency and Computation: Practice and Experience 15 9 (2003) 803\u2013820. 10.1002\/cpe.728","DOI":"10.1002\/cpe.728"},{"key":"e_1_3_3_3_35_2","unstructured":"Erlangen National High Performance Computing Center. 2025. GPU benchmarks. https:\/\/github.com\/RRZE-HPC\/gpu-benches"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","unstructured":"Dominik Ernst Georg Hager Jonas Thies and Gerhard Wellein. 2021. Performance engineering for real and complex tall & skinny matrix multiplication kernels on GPUs. The International Journal of High Performance Computing Applications 35 1 (2021) 5\u201319. 10.1177\/1094342020965661","DOI":"10.1177\/1094342020965661"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","unstructured":"Dominik Ernst Markus Holzer Georg Hager Matthias Knorr and Gerhard Wellein. 2023. Analytical performance estimation during code generation on modern GPUs. J. Parallel and Distrib. Comput. 173 (2023) 152\u2013167. 10.1016\/j.jpdc.2022.11.003","DOI":"10.1016\/j.jpdc.2022.11.003"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2004.26"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533229"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3629526.3653835"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS61541.2024.00031"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3676151.3719377"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/IMPACT59481.2023.10348822"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","unstructured":"Wen-mei Hwu and Sanjay Patel. 2018. Accelerator Architectures \u2014A Ten-Year Retrospective. IEEE Micro 38 6 (2018) 56\u201362. 10.1109\/MM.2018.2877839","DOI":"10.1109\/MM.2018.2877839"},{"key":"e_1_3_3_3_45_2","volume-title":"High Bandwidth Memory (HBM3) DRAM","author":"Council Joint Electron Device Engineering","year":"2023","unstructured":"Joint Electron Device Engineering Council. 2023. High Bandwidth Memory (HBM3) DRAM. Standard. JEDEC Solid State Technology Assoc., Virginia, US."},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","unstructured":"Mla\u0111an Jovanovi\u0107 and Mark Campbell. 2022. Generative Artificial Intelligence: Trends and Prospects. Computer 55 10 (2022) 107\u2013112. 10.1109\/MC.2022.3192720","DOI":"10.1109\/MC.2022.3192720"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/PDP52278.2021.00027"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00103"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","unstructured":"Andreas\u00a0Kosmas Kakolyris Dimosthenis Masouros Sotirios Xydis and Dimitrios Soudris. 2024. SLO-Aware GPU DVFS for Energy-Efficient LLM Inference Serving. IEEE Computer Architecture Letters 23 2 (2024) 150\u2013153. 10.1109\/LCA.2024.3406038","DOI":"10.1109\/LCA.2024.3406038"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3715996"},{"key":"e_1_3_3_3_51_2","unstructured":"Andrew Kerr Duane Merrill Julien Demouth John Tran Naila Farooqui Markus Tavenrath Vince Schuster Eddie Gornish Jerry Zheng and Bageshri Sathe. 2018. CUTLASS: CUDA Template Library for Dense Linear Algebra at All Levels and Scales. NVIDIA GPU Technology Conference (GTC) s8854 (Mar 2018). https:\/\/on-demand.gputechconf.com\/gtc\/2018\/presentation\/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532370"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/IMW59701.2024.10536972"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTR.2009.5289128"},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-21867-5_1"},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/2370816.2370850"},{"key":"e_1_3_3_3_57_2","unstructured":"Chester Lam and George Cozma. 2023. AMD\u2019s CDNA 3 Compute Architecture. https:\/\/chipsandcheese.com\/p\/amds-cdna-3-compute-architecture"},{"key":"e_1_3_3_3_58_2","unstructured":"Chester Lam George Cozma and Neggles. 2024. Testing AMD\u2019s Giant MI300X. https:\/\/chipsandcheese.com\/p\/testing-amds-giant-mi300x"},{"key":"e_1_3_3_3_59_2","unstructured":"Ehsan Latif Yifan Zhou Shuchen Guo Yizhu Gao Lehong Shi Matthew Nayaaba Gyeonggeon Lee Liang Zhang Arne Bewersdorff Luyang Fang Xiantong Yang Huaqin Zhao Hanqi Jiang Haoran Lu Jiaxi Li Jichao Yu Weihang You Zhengliang Liu Vincent\u00a0Shung Liu Hui Wang Zihao Wu Jin Lu Fei Dou Ping Ma Ninghao Liu Tianming Liu and Xiaoming Zhai. 2024. A Systematic Assessment of OpenAI o1-Preview for Higher Order Thinking in Education. arxiv:https:\/\/arXiv.org\/abs\/2410.21287\u00a0[cs.CY] https:\/\/arxiv.org\/abs\/2410.21287"},{"key":"e_1_3_3_3_60_2","doi-asserted-by":"crossref","unstructured":"Ehsan Latif Yifan Zhou Shuchen Guo Lehong Shi Yizhu Gao Matthew Nyaaba Arne Bewerdorff Xiantong Yang and Xiaoming Zhai. 2024. Can OpenAI o1 outperform humans in higher-order cognitive thinking? arxiv:https:\/\/arXiv.org\/abs\/2412.05753\u00a0[cs.CY] https:\/\/arxiv.org\/abs\/2412.05753","DOI":"10.21203\/rs.3.rs-7435042\/v1"},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","unstructured":"Matthew Leinhauser Ren\u00e9 Widera Sergei Bastrakov Alexander Debus Michael Bussmann and Sunita Chandrasekaran. 2022. Metrics and Design of an Instruction Roofline Model for AMD GPUs. ACM Trans. Parallel Comput. 9 1 Article 1 (Jan. 2022) 14\u00a0pages. 10.1145\/3505285","DOI":"10.1145\/3505285"},{"key":"e_1_3_3_3_62_2","unstructured":"Baolin Li Yankai Jiang Vijay Gadepally and Devesh Tiwari. 2024. LLM Inference Serving: Survey of Recent Advances and Opportunities. arxiv:https:\/\/arXiv.org\/abs\/2407.12391\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2407.12391"},{"key":"e_1_3_3_3_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid59990.2024.00083"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295734"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"publisher","unstructured":"Erik Lindholm John Nickolls Stuart Oberman and John Montrym. 2008. NVIDIA Tesla: A Unified Graphics and Computing Architecture. IEEE Micro 28 2 (2008) 39\u201355. 10.1109\/MM.2008.31","DOI":"10.1109\/MM.2008.31"},{"key":"e_1_3_3_3_66_2","unstructured":"Zhiye Liu. 2023. AMD\u2019s Unannounced MI300C AI Accelerator Emerges. https:\/\/www.tomshardware.com\/news\/amds-special-mi300c-ai-accelerator-emerges"},{"key":"e_1_3_3_3_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589349"},{"key":"e_1_3_3_3_68_2","doi-asserted-by":"publisher","unstructured":"Tania Malik Vladimir Rychkov and Alexey Lastovetsky. 2016. Network-aware optimization of communications for parallel matrix multiplication on hierarchical HPC platforms. Concurrency and Computation: Practice and Experience 28 3 (2016) 802\u2013821. 10.1002\/cpe.3609 arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/cpe.3609","DOI":"10.1002\/cpe.3609"},{"key":"e_1_3_3_3_69_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-17248-4_9"},{"key":"e_1_3_3_3_70_2","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS.2016.006"},{"key":"e_1_3_3_3_71_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"e_1_3_3_3_72_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-40843-4_30"},{"key":"e_1_3_3_3_73_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"e_1_3_3_3_74_2","doi-asserted-by":"publisher","unstructured":"Miltiadis Moralis-Pegios Stelios Pitris Charoula Mitsolidou Konstantinos Fotiadis Hannes Ramon Joris Lambrecht Johan Bauwelinck Xin Yin Yoojin Ban Peter De\u00a0Heyn Joris Van\u00a0Campenhout Tobias Lamprecht Andreas Lehnman Nikos Pleros and Theoni Alexoudi. 2021. Silicon circuits for chip-to-chip communications in multi-socket server board interconnects. IET Optoelectronics 15 2 (2021) 102\u2013110. 10.1049\/ote2.12018 arXiv:https:\/\/ietresearch.onlinelibrary.wiley.com\/doi\/pdf\/10.1049\/ote2.12018","DOI":"10.1049\/ote2.12018"},{"key":"e_1_3_3_3_75_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42614.2022.9731107"},{"key":"e_1_3_3_3_76_2","unstructured":"NVIDIA Corporation. 2025. Matrix Multiplication Background User\u2019s Guide. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-matrix-multiplication\/index.html"},{"key":"e_1_3_3_3_77_2","volume-title":"OCP 8-bit Floating Point Specification (OFP8): Revision 1.0","author":"Summer Oberman, Stuart and Micikevicius, Paulius and Dubey, Pradeep and Cornea, Marius and Rodriguez, Andres and Bratt, Ian and Grisenthwaite, Richard and Jouppi, Norm and Chou, Chiachen and Huffman, Amber and Schulte, Michael and Wittig, Ralph and Jani, Dharmesh and Deng,","year":"2023","unstructured":"Oberman, Stuart and Micikevicius, Paulius and Dubey, Pradeep and Cornea, Marius and Rodriguez, Andres and Bratt, Ian and Grisenthwaite, Richard and Jouppi, Norm and Chou, Chiachen and Huffman, Amber and Schulte, Michael and Wittig, Ralph and Jani, Dharmesh and Deng, Summer. 2023. OCP 8-bit Floating Point Specification (OFP8): Revision 1.0. Standard. Open Compute Project."},{"key":"e_1_3_3_3_78_2","volume-title":"OCP OAI Universal Baseboard Design Specification v1.0","author":"Group OCP OAI JDA","year":"2020","unstructured":"OCP OAI JDA Group. 2020. OCP OAI Universal Baseboard Design Specification v1.0. Standard. Open Compute Project. http:\/\/files.opencompute.org\/oc\/public.php?service=files&t=f924e6fefb20c9be651c61c0f4b1a5cc"},{"key":"e_1_3_3_3_79_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640383"},{"key":"e_1_3_3_3_80_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-69953-0_16"},{"key":"e_1_3_3_3_81_2","unstructured":"Oostrum Ren\u00e9 van and Chalmers Noel and McDougall Damon and Bauman Paul and Curtis Nicholas and Malaya Nicholas and Wolfe Noah. 2019. AMD GPU Hardware Basics. https:\/\/www.olcf.ornl.gov\/wp-content\/uploads\/2019\/10\/ORNL_Application_Readiness_Workshop-AMD_GPU_Basics.pdf"},{"key":"e_1_3_3_3_82_2","doi-asserted-by":"publisher","DOI":"10.23919\/VLSICircuits52068.2021.9492400"},{"key":"e_1_3_3_3_83_2","unstructured":"Dylan Patel George Cozma and Gerald Wong. 2023. AMD MI300 \u2013 Taming The Hype \u2013 AI Performance Volume Ramp Customers Cost IO Networking Software. https:\/\/www.semianalysis.com\/p\/amd-mi300-taming-the-hype-ai-performance"},{"key":"e_1_3_3_3_84_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2019.00017"},{"key":"e_1_3_3_3_85_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-13581-1"},{"key":"e_1_3_3_3_86_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-14047-1"},{"key":"e_1_3_3_3_87_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-10968-3"},{"key":"e_1_3_3_3_88_2","doi-asserted-by":"publisher","unstructured":"Szil\u00e1rd P\u00e1ll Artem Zhmurov Paul Bauer Mark Abraham Magnus Lundborg Alan Gray Berk Hess and Erik Lindahl. 2020. Heterogeneous parallelization and acceleration of molecular dynamics simulations in GROMACS. The Journal of Chemical Physics 153 13 (10 2020) 134110. 10.1063\/5.0018516","DOI":"10.1063\/5.0018516"},{"key":"e_1_3_3_3_89_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2013.89"},{"key":"e_1_3_3_3_90_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624195"},{"key":"e_1_3_3_3_91_2","doi-asserted-by":"publisher","unstructured":"Shaolei Ren Bill Tomlinson Rebecca\u00a0W. Black and Andrew\u00a0W. Torrance. 2024. Reconciling the contrasting narratives on the environmental impact of large language models. Scientific Reports 14 1 (01 Nov 2024) 26310. 10.1038\/s41598-024-76682-6","DOI":"10.1038\/s41598-024-76682-6"},{"key":"e_1_3_3_3_92_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC43674.2020.9286149"},{"key":"e_1_3_3_3_93_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-01769-87"},{"key":"e_1_3_3_3_94_2","unstructured":"Ben Sander Evan Masters Babak Poursatip and Henry Ho. 2025. Measuring Max-Achievable FLOPs \u2013 Part 2. https:\/\/rocm.blogs.amd.com\/software-tools-optimization\/measuring-max-achievable-flops-part2\/README.html."},{"key":"e_1_3_3_3_95_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS61541.2024.00022"},{"key":"e_1_3_3_3_96_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00079"},{"key":"e_1_3_3_3_97_2","doi-asserted-by":"publisher","unstructured":"Murray Shanahan. 2024. Talking about Large Language Models. Commun. ACM 67 2 (Jan. 2024) 68\u201379. 10.1145\/3624724","DOI":"10.1145\/3624724"},{"key":"e_1_3_3_3_98_2","doi-asserted-by":"publisher","unstructured":"Eva Siegmann Robert\u00a0J. Harrison David Carlson Smeet Chheda Anthony Curtis Firat Coskun Raul Gonzalez Daniel Wood and Nikolay\u00a0A. Simakov. 2024. First Impressions of the Sapphire Rapids Processor with HBM for Scientific Workloads. SN Computer Science 5 5 (07 Jun 2024) 623. 10.1007\/s42979-024-02958-3","DOI":"10.1007\/s42979-024-02958-3"},{"key":"e_1_3_3_3_99_2","series-title":"(SC \u201922)","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Sinha Prasoon","year":"2022","unstructured":"Prasoon Sinha, Akhil Guliani, Rutwik Jain, Brandon Tran, Matthew\u00a0D. Sinclair, and Shivaram Venkataraman. 2022. Not all GPUs are created equal: characterizing variability in large-scale, accelerator-rich systems. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Dallas, Texas) (SC \u201922). IEEE Press, New York, NY, USA, Article 65, 15\u00a0pages."},{"key":"e_1_3_3_3_100_2","doi-asserted-by":"publisher","unstructured":"Alan Smith and Vamsi\u00a0Krishna Alla. 2025. AMD Instinct\u2122 MI300X: A Generative AI Accelerator and Platform Architecture. IEEE Micro 1 1 (2025) 1\u20139. 10.1109\/MM.2025.3552324","DOI":"10.1109\/MM.2025.3552324"},{"key":"e_1_3_3_3_101_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC49657.2024.10454441"},{"key":"e_1_3_3_3_102_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00068"},{"key":"e_1_3_3_3_103_2","doi-asserted-by":"publisher","DOI":"10.1109\/VLSITechnologyandCir46783.2024.10631545"},{"key":"e_1_3_3_3_104_2","unstructured":"Jovan Stojkovic Esha Choukse Chaojie Zhang Inigo Goiri and Josep Torrellas. 2024. Towards Greener LLMs: Bringing Energy-Efficiency to the Forefront of LLM Inference. arxiv:https:\/\/arXiv.org\/abs\/2403.20306\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2403.20306"},{"key":"e_1_3_3_3_105_2","unstructured":"Peng Sun Andy Luo Seungrok Jung Liz Li and Hai Xiao. 2025. Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X. https:\/\/rocm.blogs.amd.com\/artificial-intelligence\/DeepSeekR1-Part2\/README.html."},{"key":"e_1_3_3_3_106_2","doi-asserted-by":"publisher","unstructured":"Wei Sun Ang Li Tong Geng Sander Stuijk and Henk Corporaal. 2023. Dissecting Tensor Cores via Microbenchmarks: Latency Throughput and Numeric Behaviors. IEEE Transactions on Parallel and Distributed Systems 34 1 (2023) 246\u2013261. 10.1109\/TPDS.2022.3217824","DOI":"10.1109\/TPDS.2022.3217824"},{"key":"e_1_3_3_3_107_2","doi-asserted-by":"publisher","unstructured":"Li Tan Shashank Kothapalli Longxiang Chen Omar Hussaini Ryan Bissiri and Zizhong Chen. 2014. A survey of power and energy efficient techniques for high performance numerical linear algebra operations. Parallel Comput. 40 10 (2014) 559\u2013573. 10.1016\/j.parco.2014.09.001","DOI":"10.1016\/j.parco.2014.09.001"},{"key":"e_1_3_3_3_108_2","doi-asserted-by":"publisher","DOI":"10.23919\/ISC.2024.10528925"},{"key":"e_1_3_3_3_109_2","doi-asserted-by":"publisher","unstructured":"Timm Teubner Christoph\u00a0M. Flath Christof Weinhardt Wil van\u00a0der Aalst and Oliver Hinz. 2023. Welcome to the Era of ChatGPT et al. Business & Information Systems Engineering 65 2 (01 Apr 2023) 95\u2013101. 10.1007\/s12599-023-00795-x","DOI":"10.1007\/s12599-023-00795-x"},{"key":"e_1_3_3_3_110_2","doi-asserted-by":"publisher","unstructured":"Ilario\u00a0G. Tironi Ren\u00e9 Sperb Paul\u00a0E. Smith and Wilfred\u00a0F. van Gunsteren. 1995. A generalized reaction field method for molecular dynamics simulations. The Journal of Chemical Physics 102 13 (04 1995) 5451\u20135459. 10.1063\/1.469273 arXiv:https:\/\/pubs.aip.org\/aip\/jcp\/article-pdf\/102\/13\/5451\/19142781\/5451_1_online.pdf","DOI":"10.1063\/1.469273"},{"key":"e_1_3_3_3_111_2","unstructured":"TOP500.org. 2025. Frequently Asked Questions: The Linpack Benchmark. https:\/\/top500.org\/resources\/frequently-asked-questions\/."},{"key":"e_1_3_3_3_112_2","doi-asserted-by":"publisher","unstructured":"David Van Der\u00a0Spoel Erik Lindahl Berk Hess Gerrit Groenhof Alan\u00a0E. Mark and Herman J.\u00a0C. Berendsen. 2005. GROMACS: Fast flexible and free. Journal of Computational Chemistry 26 16 (2005) 1701\u20131718. 10.1002\/jcc.20291 arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/jcc.20291","DOI":"10.1002\/jcc.20291"},{"key":"e_1_3_3_3_113_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-658-45010-6_15"},{"key":"e_1_3_3_3_114_2","unstructured":"Shashank Verma and Neal Vaidya. 2023. Matrix Multiplication Background User\u2019s Guide. Mastering LLM Techniques: Inference Optimization"},{"key":"e_1_3_3_3_115_2","unstructured":"Siwei Wu Zhongyuan Peng Xinrun Du Tuney Zheng Minghao Liu Jialong Wu Jiachen Ma Yizhi Li Jian Yang Wangchunshu Zhou Qunshu Lin Junbo Zhao Zhaoxiang Zhang Wenhao Huang Ge Zhang Chenghua Lin and J.\u00a0H. Liu. 2024. A Comparative Study on Reasoning Patterns of OpenAI\u2019s o1 Model. arxiv:https:\/\/arXiv.org\/abs\/2410.13639\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2410.13639"},{"key":"e_1_3_3_3_116_2","doi-asserted-by":"publisher","unstructured":"Zhiwei Yang Lu Lu and Ruimin Wang. 2022. A batched GEMM optimization framework for deep learning. The Journal of Supercomputing 78 11 (March 2022) 13393\u201313408. 10.1007\/s11227-022-04336-3","DOI":"10.1007\/s11227-022-04336-3"},{"key":"e_1_3_3_3_117_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00077"},{"key":"e_1_3_3_3_118_2","doi-asserted-by":"publisher","unstructured":"Haiyan Zhao Hanjie Chen Fan Yang Ninghao Liu Huiqi Deng Hengyi Cai Shuaiqiang Wang Dawei Yin and Mengnan Du. 2024. Explainability for Large Language Models: A Survey. ACM Trans. Intell. Syst. Technol. 15 2 Article 20 (Feb. 2024) 38\u00a0pages. 10.1145\/3639372","DOI":"10.1145\/3639372"},{"key":"e_1_3_3_3_119_2","volume-title":"OCP Accelerator Module Design Spec Package v1.1","author":"Zhao Whitney","year":"2020","unstructured":"Whitney Zhao, Tiffany Jin, Cheng Chen, Siamak Tavallaei, Zhenghui Wu, Song\u00a0Kok Hang, Ben Wei, Jubin Mehta, Yuval Itkin, and Hao Shen. 2020. OCP Accelerator Module Design Spec Package v1.1. Standard. Open Compute Project. https:\/\/www.opencompute.org\/documents\/ocp-accelerator-module-design-specification-v1p1-1-pdf"},{"key":"e_1_3_3_3_120_2","unstructured":"Lianmin Zheng Liangsheng Yin Zhiqiang Xie Chuyue Sun Jeff Huang Cody\u00a0Hao Yu Shiyi Cao Christos Kozyrakis Ion Stoica Joseph\u00a0E. Gonzalez Clark Barrett and Ying Sheng. 2024. SGLang: Efficient Execution of Structured Language Model Programs. arxiv:https:\/\/arXiv.org\/abs\/2312.07104\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2312.07104"},{"key":"e_1_3_3_3_121_2","doi-asserted-by":"publisher","unstructured":"Yue Zheng Yuhao Chen Bin Qian Xiufang Shi Yuanchao Shu and Jiming Chen. 2025. A Review on Edge Large Language Models: Design Execution and Applications. ACM Comput. Surv. 57 8 Article 209 (March 2025) 35\u00a0pages. 10.1145\/3719664","DOI":"10.1145\/3719664"},{"key":"e_1_3_3_3_122_2","unstructured":"Tianyang Zhong Zhengliang Liu Yi Pan Yutong Zhang Yifan Zhou Shizhe Liang Zihao Wu Yanjun Lyu Peng Shu Xiaowei Yu Chao Cao Hanqi Jiang Hanxu Chen Yiwei Li Junhao Chen Huawen Hu Yihen Liu Huaqin Zhao Shaochen Xu Haixing Dai Lin Zhao Ruidong Zhang Wei Zhao Zhenyuan Yang Jingyuan Chen Peilong Wang Wei Ruan Hui Wang Huan Zhao Jing Zhang Yiming Ren Shihuan Qin Tong Chen Jiaxi Li Arif\u00a0Hassan Zidan Afrar Jahin Minheng Chen Sichen Xia Jason Holmes Yan Zhuang Jiaqi Wang Bochen Xu Weiran Xia Jichao Yu Kaibo Tang Yaxuan Yang Bolun Sun Tao Yang Guoyu Lu Xianqiao Wang Lilong Chai He Li Jin Lu Lichao Sun Xin Zhang Bao Ge Xintao Hu Lian Zhang Hua Zhou Lu Zhang Shu Zhang Ninghao Liu Bei Jiang Linglong Kong Zhen Xiang Yudan Ren Jun Liu Xi Jiang Yu Bao Wei Zhang Xiang Li Gang Li Wei Liu Dinggang Shen Andrea Sikora Xiaoming Zhai Dajiang Zhu and Tianming Liu. 2024. Evaluation of OpenAI o1: Opportunities and Challenges of AGI. arxiv:https:\/\/arXiv.org\/abs\/2409.18486\u00a0[cs.CL]"},{"key":"e_1_3_3_3_123_2","doi-asserted-by":"publisher","DOI":"10.1145\/3543622.3573210"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759768","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:46:51Z","timestamp":1773254811000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759768"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":122,"alternative-id":["10.1145\/3712285.3759768","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759768","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}