{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T02:04:10Z","timestamp":1773713050926,"version":"3.50.1"},"reference-count":65,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2018,11,1]],"date-time":"2018-11-01T00:00:00Z","timestamp":1541030400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"funder":[{"DOI":"10.13039\/100002418","name":"Intel Corporation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100002418","id-type":"DOI","asserted-by":"publisher"}]},{"name":"UVSQ"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Proc. IEEE"],"published-print":{"date-parts":[[2018,11]]},"DOI":"10.1109\/jproc.2018.2851190","type":"journal-article","created":{"date-parts":[[2018,8,9]],"date-time":"2018-08-09T18:42:50Z","timestamp":1533840170000},"page":"1985-2003","source":"Crossref","is-referenced-by-count":4,"title":["The Long and Winding Road Toward Efficient High-Performance Computing"],"prefix":"10.1109","volume":"106","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4975-5469","authenticated-orcid":false,"given":"William","family":"Jalby","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"David","family":"Kuck","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Allen D.","family":"Malony","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1073-4457","authenticated-orcid":false,"given":"Michel","family":"Masella","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abdelhafid","family":"Mazouz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3498-8147","authenticated-orcid":false,"given":"Mihail","family":"Popov","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"crossref","first-page":"685","DOI":"10.1002\/cpe.1553","article-title":"HPCTOOLKIT: Tools for performance analysis of optimized parallel programs","volume":"22","author":"adhianto","year":"2010","journal-title":"Concurrency Comput Pract Exper"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1177\/1094342006064482"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1177\/109434200001400404"},{"key":"ref32","first-page":"513","article-title":"Guided performance analysis combining profile and trace tools","author":"gim\u00e9nez","year":"2010","journal-title":"Proc Eur Conf Parallel Process"},{"key":"ref31","article-title":"MAQAO: Modular assembler quality analyzer and optimizer for Itanium 2","author":"djoudi","year":"2005","journal-title":"Proc 4th Workshop EPIC Archit Compil Technol"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1201\/b10509-5"},{"key":"ref37","year":"0","journal-title":"Intel VTune Amplifier"},{"key":"ref36","article-title":"PAPI: A portable interface to hardware performance counters","author":"mucci","year":"1999","journal-title":"Proc Dept of Defense HPCMP Users Group Conference"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2010.38"},{"key":"ref34","year":"0","journal-title":"MAP Low-Overhead Profiling to Optimize C C++ Fortran and F90 Codes"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pcbi.1005659"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/2432516.2432517"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3030207.3030224"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063454"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-43659-3_18"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1177\/1094342014568690"},{"key":"ref27","first-page":"6","article-title":"CERE: LLVM-based codelet extractor and replayer for piecewise benchmarking and optimization","volume":"12","author":"de oliveira castro","year":"2015","journal-title":"ACM Trans Archit Code Optim"},{"key":"ref65","first-page":"161","article-title":"Evaluating out-of-order engine limitations using Uop flow simulation","author":"palomares","year":"2016","journal-title":"Tools for High Performance Computing"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.4190"},{"key":"ref2","year":"0","journal-title":"HPCG Benchmark"},{"key":"ref1","year":"0","journal-title":"Top500 Supercomputer Sites"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2004.840306"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1201\/b10509-11"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.scico.2005.10.013"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339587"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s10766-010-0161-2"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-71528-3_4"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1177\/1094342013493124"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2005.55"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2008.5222642"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1021\/ct9004068"},{"key":"ref58","first-page":"1085","article-title":"Exploring SIMD for molecular dynamics, using Intel Xeon processors and Intel Xeon Phi coprocessors","author":"pennycook","year":"2013","journal-title":"Proc IEEE 27th Int Symp Parallel Distrib Process"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1063\/1.4904922"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1002\/jcc.23501"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.9"},{"key":"ref54","first-page":"7","article-title":"VP3: A vectorization potential performance prototype","author":"wong","year":"2015","journal-title":"Proc of WPMVP"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.41"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2011.233"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2006.37"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2007.32"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.1556"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2005.10"},{"key":"ref13","first-page":"50","article-title":"Loop transformation recipes for code generation and auto-tuning","author":"hall","year":"2009","journal-title":"Proc Int Workshop Lang Compil Parallel Comput"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161004"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/2591635.2667174"},{"key":"ref16","first-page":"1","article-title":"Automatically tuned linear algebra software","author":"whaley","year":"1998","journal-title":"Proc 1998 ACM\/IEEE Supercomputing 98 Conf"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/16\/1\/071"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2004.840848"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2004.840301"},{"key":"ref4","article-title":"Instruction tables: Lists of instruction latencies, throughputs and micro-operation breakdowns for Intel, AMD and VIA CPUs","author":"fog","year":"2011"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/1186736.1186737"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2014.7116904"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.combustflame.2010.12.004"},{"key":"ref8","article-title":"Iterative compilation in a non-linear optimisation space","author":"bodin","year":"1998","journal-title":"Proc Workshop Profile Feedback-Directed Compil"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2005.29"},{"key":"ref9","article-title":"ACOVEA: Analysis of compiler options via evolutionary algorithm","author":"ladd","year":"2004"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-08144-1_9"},{"key":"ref45","first-page":"49","author":"lindlan","year":"2000","journal-title":"Proc ACM\/IEEE Conf Supercomput"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/2693561.2693569"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/2464996.2465440"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1155\/2008\/713705"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.001"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-68564-7_9"},{"key":"ref43","first-page":"79","article-title":"Score-P: A joint performance measurement run-time infrastructure for periscope, Scalasca, TAU, and Vampir","author":"kn\u00fcpfer","year":"2012","journal-title":"Tools for High Performance Computing"}],"container-title":["Proceedings of the IEEE"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5\/8510998\/08430508.pdf?arnumber=8430508","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,9]],"date-time":"2024-07-09T02:50:35Z","timestamp":1720493435000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8430508\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,11]]},"references-count":65,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/jproc.2018.2851190","relation":{},"ISSN":["0018-9219","1558-2256"],"issn-type":[{"value":"0018-9219","type":"print"},{"value":"1558-2256","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,11]]}}}