{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T11:36:57Z","timestamp":1774006617826,"version":"3.50.1"},"reference-count":32,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T00:00:00Z","timestamp":1769731200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100002347","name":"Bundesministerium f\u00fcr Forschung, Technologie und Raumfahrt","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002347","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Parallel Computing"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.parco.2026.103183","type":"journal-article","created":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T16:48:11Z","timestamp":1769705291000},"page":"103183","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Microarchitectural comparison, in-core modeling, and memory hierarchy analysis of state-of-the-art CPUs: Grace, Sapphire Rapids, and Genoa"],"prefix":"10.1016","volume":"127","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3776-9353","authenticated-orcid":false,"given":"Jan","family":"Laukemann","sequence":"first","affiliation":[]},{"given":"Georg","family":"Hager","sequence":"additional","affiliation":[]},{"given":"Gerhard","family":"Wellein","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"4","key":"10.1016\/j.parco.2026.103183_b1","doi-asserted-by":"crossref","first-page":"65","DOI":"10.1145\/1498765.1498785","article-title":"Roofline: An insightful visual performance model for multicore architectures","volume":"52","author":"Williams","year":"2009","journal-title":"Commun. ACM"},{"key":"10.1016\/j.parco.2026.103183_b2","series-title":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region Workshops","first-page":"45","article-title":"NVIDIA grace superchip early evaluation for HPC applications","author":"Banchelli","year":"2024"},{"key":"10.1016\/j.parco.2026.103183_b3","series-title":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region Workshops","first-page":"36","article-title":"First impressions of the NVIDIA grace CPU superchip and NVIDIA grace hopper superchip for scientific workloads","author":"Simakov","year":"2024"},{"key":"10.1016\/j.parco.2026.103183_b4","series-title":"2018 IEEE\/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems","first-page":"121","article-title":"Automated instruction stream throughput prediction for Intel and AMD microarchitectures","author":"Laukemann","year":"2018"},{"key":"10.1016\/j.parco.2026.103183_b5","series-title":"2019 IEEE\/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems","first-page":"1","article-title":"Automatic throughput and critical path analysis of x86 and ARM assembly kernels","author":"Laukemann","year":"2019"},{"key":"10.1016\/j.parco.2026.103183_b6","series-title":"ICS \u201922: 2022 International Conference on Supercomputing, Virtual Event, USA, June 27-30, 2022","first-page":"1","article-title":"Uica: Accurate throughput prediction of basic blocks on recent intel microarchitectures","author":"Abel","year":"2022"},{"key":"10.1016\/j.parco.2026.103183_b7","series-title":"2023 IEEE International Symposium on Workload Characterization","first-page":"87","article-title":"Facile: Fast, accurate, and interpretable basic-block throughput prediction","author":"Abel","year":"2023"},{"key":"10.1016\/j.parco.2026.103183_b8","series-title":"LLVM machine code analyzer","author":"LLVM Compiler Infrastructure","year":"2024"},{"key":"10.1016\/j.parco.2026.103183_b9","series-title":"2014 21st International Conference on High Performance Computing","first-page":"1","article-title":"CQA: A code quality analyzer tool at binary level","author":"Charif-Rubial","year":"2014"},{"key":"10.1016\/j.parco.2026.103183_b10","series-title":"International Conference on Machine Learning","first-page":"4505","article-title":"Ithemal: Accurate, portable and fast basic block throughput estimation using deep neural networks","author":"Mendis","year":"2019"},{"key":"10.1016\/j.parco.2026.103183_b11","series-title":"2022 IEEE International Symposium on Workload Characterization","first-page":"14","article-title":"GRANITE: A graph neural network model for basic block throughput estimation","author":"Sykora","year":"2022"},{"key":"10.1016\/j.parco.2026.103183_b12","series-title":"2024 IEEE International Parallel and Distributed Processing Symposium","first-page":"350","article-title":"CloverLeaf on intel multi-core CPUs: A case study in write-allocate evasion","author":"Laukemann","year":"2024"},{"key":"10.1016\/j.parco.2026.103183_b13","series-title":"SC24-W: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","first-page":"1405","article-title":"Microarchitectural comparison and in-core modeling of state-of-the-art CPUs: Grace, Sapphire Rapids, and Genoa","author":"Laukemann","year":"2024"},{"key":"10.1016\/j.parco.2026.103183_b14","series-title":"Arm Neoverse V2 Core Software Optimization Guide","author":"Arm Limited","year":"2022"},{"key":"10.1016\/j.parco.2026.103183_b15","series-title":"Job Scheduling Strategies for Parallel Processing","first-page":"44","article-title":"SLURM: Simple linux utility for resource management","author":"Yoo","year":"2003"},{"key":"10.1016\/j.parco.2026.103183_b16","series-title":"43rd International Conference on Parallel Processing Workshops","first-page":"176","article-title":"Overhead analysis of performance counter measurements","author":"Roehl","year":"2014"},{"key":"10.1016\/j.parco.2026.103183_b17","series-title":"LIKWID","author":"Gruber","year":"2023"},{"key":"10.1016\/j.parco.2026.103183_b18","series-title":"Intel\u00ae 64 and IA-32 architecture optimization reference manual","year":"2023"},{"key":"10.1016\/j.parco.2026.103183_b19","series-title":"Software optimization guide for the AMD Zen4 microarchitecture","year":"2023"},{"key":"10.1016\/j.parco.2026.103183_b20","series-title":"ASPLOS","first-page":"673","article-title":"Uops.info: Characterizing latency, throughput, and port usage of instructions on Intel microarchitectures","author":"Abel","year":"2019"},{"key":"10.1016\/j.parco.2026.103183_b21","series-title":"Ooo instruction benchmarking framework on the back of dragons","author":"Hammer","year":"2018"},{"key":"10.1016\/j.parco.2026.103183_b22","series-title":"Ibench - instruction benchmarks","author":"Hofmann","year":"2017"},{"issue":"2","key":"10.1016\/j.parco.2026.103183_b23","first-page":"54","article-title":"Bridging the architecture gap: Abstracting performance-relevant properties of modern server processors","volume":"7","author":"Hofmann","year":"2020","journal-title":"Supercomput. Front. Innov."},{"key":"10.1016\/j.parco.2026.103183_b24","series-title":"Proceedings of the 29th ACM on International Conference on Supercomputing","first-page":"207","article-title":"Quantifying performance bottlenecks of stencil computations using the execution-cache-memory model","author":"Stengel","year":"2015"},{"key":"10.1016\/j.parco.2026.103183_b25","series-title":"Companion of the 2023 ACM\/SPEC International Conference on Performance Engineering","first-page":"127","article-title":"Core-level performance engineering with the open-source architecture code analyzer (OSACA) and the compiler explorer","author":"Laukemann","year":"2023"},{"key":"10.1016\/j.parco.2026.103183_b26","series-title":"IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter","first-page":"19","article-title":"Memory bandwidth and machine balance in current high performance computers","author":"McCalpin","year":"1995"},{"key":"10.1016\/j.parco.2026.103183_b27","series-title":"Design and Implementation of an Automated Performance Modeling Toolkit for Regular Loop Kernels","author":"Hammer","year":"2023"},{"key":"10.1016\/j.parco.2026.103183_b28","series-title":"Hot Chips Symposium","first-page":"1","article-title":"New 3rd gen Intel\u00ae Xeon\u00ae scalable processor (codename: Ice lake-SP)","author":"Papazian","year":"2020"},{"key":"10.1016\/j.parco.2026.103183_b29","unstructured":"G. Hager, [Online]. Available: https:\/\/blogs.fau.de\/hager\/archives\/8997."},{"issue":"20","key":"10.1016\/j.parco.2026.103183_b30","doi-asserted-by":"crossref","DOI":"10.1002\/cpe.6512","article-title":"Execution-cache-memory modeling and performance tuning of sparse matrix-vector multiplication and lattice quantum chromodynamics on A64FX","volume":"34","author":"Alappat","year":"2022","journal-title":"Concurr. Comput.: Pr. Exp."},{"key":"10.1016\/j.parco.2026.103183_b31","series-title":"High Performance Computing","first-page":"22","article-title":"On the accuracy and usefulness of analytic energy models for contemporary multicore processors","author":"Hofmann","year":"2018"},{"key":"10.1016\/j.parco.2026.103183_b32","series-title":"Arm Neoverse V2 Core Technical Reference Manual","author":"Limited","year":"2022"}],"container-title":["Parallel Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167819126000013?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167819126000013?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T09:21:18Z","timestamp":1773998478000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167819126000013"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":32,"alternative-id":["S0167819126000013"],"URL":"https:\/\/doi.org\/10.1016\/j.parco.2026.103183","relation":{},"ISSN":["0167-8191"],"issn-type":[{"value":"0167-8191","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Microarchitectural comparison, in-core modeling, and memory hierarchy analysis of state-of-the-art CPUs: Grace, Sapphire Rapids, and Genoa","name":"articletitle","label":"Article Title"},{"value":"Parallel Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.parco.2026.103183","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Authors. Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"103183"}}