{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:54:14Z","timestamp":1768031654975,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","funder":[{"name":"Advanced Scientific Computing Research, Scientific Discovery through Advanced Computing (SciDAC) program, in the U.S. Department of Energy, Office of Science","award":["DE-AC02-05CH11231"],"award-info":[{"award-number":["DE-AC02-05CH11231"]}]},{"name":"National Energy Research Scientific Computing Center, which is supported by the Office of Science of the U.S. Department of Energy","award":["DE-AC02-05CH11231"],"award-info":[{"award-number":["DE-AC02-05CH11231"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767497","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:20:02Z","timestamp":1762532402000},"page":"1205-1216","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Roofline Analysis of Tightly-Coupled CPU-GPU Superchips: A Study on MI300A and GH200"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4596-0289","authenticated-orcid":false,"given":"Oscar","family":"Antepara","sequence":"first","affiliation":[{"name":"AMCR, Lawrence Berkeley National Laboratory, Berkeley, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7923-2896","authenticated-orcid":false,"given":"Leonid","family":"Oliker","sequence":"additional","affiliation":[{"name":"AMCR, Lawrence Berkeley National Laboratory, Berkeley, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8327-5717","authenticated-orcid":false,"given":"Samuel","family":"Williams","sequence":"additional","affiliation":[{"name":"AMCR, Lawrence Berkeley National Laboratory, Berkeley, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","volume-title":"mixbench GitHub Repository","year":"2025","unstructured":"2025. mixbench GitHub Repository. https:\/\/github.com\/ekondis\/mixbench"},{"key":"e_1_3_3_2_3_2","unstructured":"AMD. 2021. AMD MI250X GPU ARCHITECTURE. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-business-docs\/white-papers\/amd-cdna2-white-paper.pdf"},{"key":"e_1_3_3_2_4_2","unstructured":"AMD. 2023. 4TH GEN AMD EPYC PROCESSOR ARCHITECTURE. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/epyc-business-docs\/white-papers\/221704010-B_en_4th-Gen-AMD-EPYC-Processor-Architecture\u2014White-Paper_pdf.pdf"},{"key":"e_1_3_3_2_5_2","unstructured":"AMD. 2023. AMD MI300A GPU ARCHITECTURE. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/data-sheets\/amd-instinct-mi300a-data-sheet.pdf"},{"key":"e_1_3_3_2_6_2","unstructured":"AMD. 2024. AMD Accelerator Cloud (AAC). https:\/\/aac.amd.com\/help\/"},{"key":"e_1_3_3_2_7_2","unstructured":"AMD. 2024. GPU MEMORY. https:\/\/rocm.docs.amd.com\/en\/docs-6.1.0\/conceptual\/gpu-memory.html"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00159"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3712285.3759815"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW50202.2020.00067"},{"key":"e_1_3_3_2_11_2","unstructured":"Sridutt Bhalachandra Brian Austin Samuel Williams and Nicholas\u00a0J. Wright. 2022. Understanding the Impact of Input Entropy on FPU CPU and GPU Power. arxiv:https:\/\/arXiv.org\/abs\/2212.08805\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2212.08805"},{"key":"e_1_3_3_2_12_2","unstructured":"Jeremy Duckworth Vinay Gavirangaswamy David Gloe and Brad Klein. 2023. Software-defined Multi-tenancy on HPE Cray Ex Supercomputers. https:\/\/cug.org\/proceedings\/cug2023_proceedings\/includes\/files\/pap132s2-file2.pdf."},{"key":"e_1_3_3_2_13_2","unstructured":"Jiakun Fan Yanglin Zhang Xiangchen Li and Dimitrios\u00a0S. Nikolopoulos. 2025. Parallel CPU-GPU Execution for LLM Inference on Constrained GPUs. arxiv:https:\/\/arXiv.org\/abs\/2506.03296\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2506.03296"},{"key":"e_1_3_3_2_14_2","unstructured":"Luigi Fusco Mikhail Khalilov Marcin Chrapek Giridhar Chukkapalli Thomas Schulthess and Torsten Hoefler. 2024. Understanding Data Movement in Tightly Coupled Heterogeneous Systems: A Case Study with the Grace Hopper Superchip. arxiv:https:\/\/arXiv.org\/abs\/2408.11556\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2408.11556"},{"key":"e_1_3_3_2_15_2","unstructured":"J.\u00a0Alex Hurt Grant\u00a0J. Scott Derek Weitzel and Huijun Zhu. 2024. Adventures with Grace Hopper AI Super Chip and the National Research Platform. arxiv:https:\/\/arXiv.org\/abs\/2410.16487\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2410.16487"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-49556-5_1"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673104"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00140"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","unstructured":"Elias Konstantinidis and Yiannis Cotronis. 2017. A quantitative roofline model for GPU kernel performance estimation using micro-benchmarks and hardware metric profiling. J. Parallel and Distrib. Comput. 107 (2017) 37\u201356. 10.1016\/j.jpdc.2017.04.002","DOI":"10.1016\/j.jpdc.2017.04.002"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-17248-4_7"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3736227.3736231"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","unstructured":"Ami Marowka. 2023. A comparison of two performance portability metrics. Concurrency and Computation: Practice and Experience 35 25 (2023) e7868. 10.1002\/cpe.7868 arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/cpe.7868","DOI":"10.1002\/cpe.7868"},{"key":"e_1_3_3_2_23_2","unstructured":"NVIDIA. 2017. NVIDIA V100 GPU ARCHITECTURE. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf."},{"key":"e_1_3_3_2_24_2","unstructured":"NVIDIA. 2020. NVIDIA A100 GPU ARCHITECTURE. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_3_2_25_2","unstructured":"NVIDIA. 2025. NVIDIA GH200 GPU ARCHITECTURE. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/grace-hopper-superchip."},{"key":"e_1_3_3_2_26_2","unstructured":"NVIDIA. 2025. NVIDIA GRACE CPU. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/nvidia-grace-cpu-superchip."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","unstructured":"S.J. Pennycook J.D. Sewall and V.W. Lee. 2019. Implications of a metric for performance portability. Future Generation Computer Systems 92 (2019) 947\u2013958. 10.1016\/j.future.2017.08.007","DOI":"10.1016\/j.future.2017.08.007"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673110"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00068"},{"key":"e_1_3_3_2_30_2","unstructured":"TACC. 2025. TACC: Vista Supercomputer. https:\/\/tacc.utexas.edu\/systems\/vista\/"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.23919\/ISC.2024.10528925"},{"key":"e_1_3_3_2_32_2","unstructured":"Prabhu Vellaisamy Thomas Labonte Sourav Chakraborty Matt Turner Samantika Sury and John\u00a0Paul Shen. 2025. Characterizing and Optimizing LLM Inference Workloads on CPU-GPU Coupled Architectures. arxiv:https:\/\/arXiv.org\/abs\/2504.11750\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2504.11750"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42615.2023.10067395"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3723851.3723853"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","unstructured":"Samuel Williams Andrew Waterman and David Patterson. 2009. Roofline: an insightful visual performance model for multicore architectures. Commun. ACM 52 4 (April 2009) 65\u201376. 10.1145\/1498765.1498785","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","unstructured":"Charlene Yang Thorsten Kurth and Samuel Williams. 2020. Hierarchical Roofline analysis for GPUs: Accelerating performance optimization for the NERSC-9 Perlmutter system. Concurrency and Computation: Practice and Experience 32 20 (2020) e5547. 10.1002\/cpe.5547 arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/cpe.5547e5547 cpe.5547.","DOI":"10.1002\/cpe.5547"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767497","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:35:57Z","timestamp":1767987357000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767497"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":35,"alternative-id":["10.1145\/3731599.3767497","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767497","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}