{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:25:32Z","timestamp":1743765932743,"version":"3.37.3"},"reference-count":93,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2021,6,28]],"date-time":"2021-06-28T00:00:00Z","timestamp":1624838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,6,28]],"date-time":"2021-06-28T00:00:00Z","timestamp":1624838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"name":"Triad National Security, LLC","award":["581326"],"award-info":[{"award-number":["581326"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2022,2]]},"DOI":"10.1007\/s11227-021-03949-4","type":"journal-article","created":{"date-parts":[[2021,6,28]],"date-time":"2021-06-28T11:05:20Z","timestamp":1624878320000},"page":"2354-2385","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["PPT-Multicore: performance prediction of OpenMP applications using reuse profiles and analytical modeling"],"prefix":"10.1007","volume":"78","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6879-4455","authenticated-orcid":false,"given":"Atanu","family":"Barai","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yehia","family":"Arafa","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abdel-Hameed","family":"Badawy","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gopinath","family":"Chennupati","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nandakishore","family":"Santhi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stephan","family":"Eidenbenz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,6,28]]},"reference":[{"key":"3949_CR1","unstructured":"7-CPU: 7-Zip LZMA Benchmark. https:\/\/www.7-cpu.com (2021). [Online; accessed 4-Dec-2020]"},{"key":"3949_CR2","volume-title":"Software and system development using virtual platforms: full-system simulation with Wind river simics","author":"D Aarno","year":"2014","unstructured":"Aarno D, Engblom J (2014) Software and system development using virtual platforms: full-system simulation with Wind river simics. Morgan Kaufmann, Burlington"},{"issue":"1","key":"3949_CR3","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1006\/jpdc.1997.1346","volume":"44","author":"A Alexandrov","year":"1997","unstructured":"Alexandrov A, Ionescu MF, Schauser KE, Scheiman C (1997) LogGP: incorporating long messages into the LogP model for parallel computation. J Parallel Distrib Comput 44(1):71\u201379","journal-title":"J Parallel Distrib Comput"},{"issue":"1","key":"3949_CR4","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1109\/LCA.2019.2904497","volume":"18","author":"Y Arafa","year":"2019","unstructured":"Arafa Y, Badawy AA, Chennupati G, Santhi N, Eidenbenz S (2019) Ppt-gpu: scalable gpu performance modeling. IEEE Comput Archit Lett 18(1):55\u201358","journal-title":"IEEE Comput Archit Lett"},{"key":"3949_CR5","doi-asserted-by":"crossref","unstructured":"Arafa Y, Badawy AH, Chennupati G, Barai A, Santhi N, Eidenbenz S (2020) Fast, accurate, and scalable memory modeling of GPGPUs using reuse profiles. In: Proceedings of the 34th ACM International Conference on Supercomputing, ICS \u201920. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/3392717.3392761"},{"key":"3949_CR6","doi-asserted-by":"crossref","unstructured":"Arafa Y, Chennupati G, Barai A, Badawy AHA, Santhi N, Eidenbenz S, (2019) GPUs cache performance estimation using reuse distance analysis. In: 2019 IEEE 38th International Performance of Computing and Communications Conference (IPCCC), Piscataway, NJ, USA, pp 1\u20138. IEEE","DOI":"10.1109\/IPCCC47392.2019.8958760"},{"issue":"2","key":"3949_CR7","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1109\/2.982917","volume":"35","author":"T Austin","year":"2002","unstructured":"Austin T, Larson E, Ernst D (2002) SimpleScalar: an infrastructure for computer system modeling. Computer 35(2):59\u201367","journal-title":"Computer"},{"issue":"1","key":"3949_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2851503","volume":"34","author":"M Badamo","year":"2016","unstructured":"Badamo M, Casarona J, Zhao M, Yeung D (2016) Identifying power-efficient multicore cache hierarchies via reuse distance analysis. ACM Trans Comput Syst 34(1):1\u201330","journal-title":"ACM Trans Comput Syst"},{"issue":"2","key":"3949_CR9","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1109\/LCA.2017.2695178","volume":"16","author":"AA Badawy","year":"2017","unstructured":"Badawy AA, Yeung D (2017) Guiding locality optimizations for graph computations via reuse distance analysis. IEEE Comput Archit Lett 16(2):119\u2013122","journal-title":"IEEE Comput Archit Lett"},{"key":"3949_CR10","doi-asserted-by":"crossref","unstructured":"Badawy AA, Yeung D (2017) Optimizing locality in graph computations using reuse distance profiles. In: 2017 IEEE 36th International Performance Computing and Communications Conference (IPCCC), pp 1\u20138","DOI":"10.1109\/PCCC.2017.8280444"},{"key":"3949_CR11","doi-asserted-by":"publisher","first-page":"341","DOI":"10.1145\/3422575.3422806","volume-title":"The International symposium on memory systems","author":"A Barai","year":"2020","unstructured":"Barai A, Chennupati G, Santhi N, Badawy AH, Arafa Y, Eidenbenz S (2020) PPT-SASMM: scalable analytical shared memory model: predicting the performance of multicore caches from a single-threaded execution trace. The International symposium on memory systems. MEMSYS. Association for Computing Machinery. NY, USA, New York, pp 341\u2013351"},{"key":"3949_CR12","unstructured":"Berg E, Hagersten E (2004) StatCache: a probabilistic approach to efficient and accurate data locality analysis. In: 2004 IEEE International Symposium on Performance Analysis of Systems and Software\u2014ISPASS , Piscataway, NJ, USA, pp 20\u201327. IEEE"},{"key":"3949_CR13","unstructured":"Berg E, Zeffer H, Hagersten E (2006) A statistical multiprocessor cache model. In: 2006 IEEE International Symposium on Performance Analysis of Systems and Software, Piscataway, NJ, USA, pp 89\u201399. IEEE"},{"key":"3949_CR14","unstructured":"Beyls K, D\u2019Hollander EH (2001) Reuse distance as a metric for cache behavior. In: Proceedings of the IASTED Conference on Parallel and Distributed Computing and Systems, Piscataway, NJ, USA pp. 617\u2013622. IEEE"},{"key":"3949_CR15","unstructured":"Bienia C (2011) Benchmarking modern multiprocessors. Ph.D. thesis, Princeton University"},{"issue":"2","key":"3949_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2024716.2024718","volume":"39","author":"N Binkert","year":"2011","unstructured":"Binkert N, Beckmann B, Black G, Reinhardt SK, Saidi A, Basu A, Hestness J, Hower DR, Krishna T, Sardashti S et al (2011) The gem5 simulator. ACM SIGARCH Comput Archit News 39(2):1\u20137","journal-title":"ACM SIGARCH Comput Archit News"},{"key":"3949_CR17","unstructured":"Brehob M, Enbody R (1999) An analytical model of locality and caching. Technical Report MSU-CSE-99-31"},{"issue":"3","key":"3949_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2629677","volume":"11","author":"TE Carlson","year":"2014","unstructured":"Carlson TE, Heirman W, Eyerman S, Hur I, Eeckhout L (2014) An evaluation of high-level mechanistic core models. ACM Trans Archit Code Optim (TACO) 11(3):1\u201325","journal-title":"ACM Trans Archit Code Optim (TACO)"},{"key":"3949_CR19","doi-asserted-by":"crossref","unstructured":"Carothers CD, Meredith JS, Blanco MP, Vetter JS, Mubarak M, LaPre J, Moore S (2017) Durango: scalable synthetic workload generation for extreme-scale application performance modeling and simulation. In: Proceedings of the 2017 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation, SIGSIM-PADS \u201917, pp 97\u2013108. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/3064911.3064923"},{"key":"3949_CR20","doi-asserted-by":"crossref","unstructured":"Cascaval C, Padua DA (2003) Estimating cache misses and locality using stack distances. In: Proceedings of the 17th Annual International Conference on Supercomputing, ICS \u201903, pp 150\u2013159. ACM, New York, NY, USA","DOI":"10.1145\/782814.782836"},{"key":"3949_CR21","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1007\/978-3-319-49956-7_4","volume-title":"International conference on algorithms and architectures for parallel processing","author":"G Ceballos","year":"2016","unstructured":"Ceballos G, Hagersten E, Black-Schaffer D (2016) Formalizing data locality in task parallel applications. International conference on algorithms and architectures for parallel processing. Springer, Cham, pp 43\u201361"},{"key":"3949_CR22","doi-asserted-by":"publisher","first-page":"114","DOI":"10.1007\/978-3-319-72971-8_6","volume-title":"High performance computing systems. Performance modeling, benchmarking, and simulation","author":"G Chennupati","year":"2018","unstructured":"Chennupati G, Santhi N, Bird R, Thulasidasan S, Badawy AHA, Misra S, Eidenbenz S (2018) A scalable analytical memory model for CPU performance prediction. In: Jarvis S, Wright S, Hammond S (eds) High performance computing systems. Performance modeling, benchmarking, and simulation. Springer, Cham, pp 114\u2013135"},{"key":"3949_CR23","doi-asserted-by":"crossref","unstructured":"Chennupati G, Santhi N, Eidenbenz S (2019) Scalable prformance prediction of codes with memory hierarchy and pipelines. In: Proceedings of the 2019 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation, SIGSIM-PADS \u201919, pp 13\u201324. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/3316480.3325518"},{"key":"3949_CR24","doi-asserted-by":"crossref","unstructured":"Chennupati G, Santhi N, Eidenbenz S, Thulasidasan S (2017) An analytical memory hierarchy model for performance prediction. In: Proceedings of the 2017 Winter Simulation Conference, WSC \u201917. IEEE Press, Piscataway, NJ, USA","DOI":"10.1109\/WSC.2017.8247842"},{"key":"3949_CR25","unstructured":"Chennupati G, Santhi N, Eidenbenz S, Zerr RJ, Rosa M, Zamora RJ, Park EJ, Nadiga BT, Liu J, Ahmed K, Obaida MA (2017c) Performance prediction toolkit (PPT). Los Alamos National Laboratory (LANL) . https:\/\/github.com\/lanl\/PPT"},{"key":"3949_CR26","doi-asserted-by":"crossref","unstructured":"Collange S, Daumas M, Defour D, Parello D (2010) Barra: a parallel functional simulator for GPGPU. In: 2010 IEEE International Symposium on Modeling, Analysis and Simulation of Computer and Telecommunication Systems, pp 351\u2013360. IEEE","DOI":"10.1109\/MASCOTS.2010.43"},{"key":"3949_CR27","unstructured":"Cope J, Liu N, Lang S, Carns P, Carothers C, Ross R (2011) CODES: enabling co-design of multilayer exascale storage architectures. In: Proceedings of the Workshop on Emerging Supercomputing Technologies"},{"issue":"7","key":"3949_CR28","first-page":"1","volume":"28","author":"D Culler","year":"1993","unstructured":"Culler D, Karp R, Patterson D, Sahay A, Schauser KE, Santos E, Subramonian R, von Eicken T (1993) LogP: towards a realistic model of parallel computation. Proceedings of the fourth ACM SIGPLAN symposium on principles and practice of parallel programming 28(7):1\u201312","journal-title":"Proceedings of the fourth ACM SIGPLAN symposium on principles and practice of parallel programming"},{"issue":"1","key":"3949_CR29","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1109\/99.660313","volume":"5","author":"L Dagum","year":"1998","unstructured":"Dagum L, Menon R (1998) OpenMP: an industry-standard API for shared-memory programming. IEEE Comput Sci Eng 5(1):46\u201355","journal-title":"IEEE Comput Sci Eng"},{"issue":"4","key":"3949_CR30","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2818374","volume":"12","author":"S Das","year":"2015","unstructured":"Das S, Aamodt TM, Dally WJ (2015) Reuse distance-based probabilistic cache replacement. ACM Trans Archit Code Optim 12(4):1\u201322","journal-title":"ACM Trans Archit Code Optim"},{"key":"3949_CR31","doi-asserted-by":"crossref","unstructured":"Davis JD, Laudon J, Olukotun K (2005) Maximizing CMP throughput with mediocre cores. In: Proceedings of the 14th International Conference on Parallel Architectures and Compilation Techniques, PACT \u201905, pp 51\u201362. IEEE Computer Society, USA","DOI":"10.1109\/PACT.2005.42"},{"key":"3949_CR32","doi-asserted-by":"crossref","unstructured":"De Pestel S, Steen SVd, Akram S, Eeckhout L (2018) RPPM: rapid performance prediction of multithreaded applications on multicore hardware. IEEE Comput Archit Lett 17(2):183\u2013186","DOI":"10.1109\/LCA.2018.2849983"},{"key":"3949_CR33","unstructured":"Ding C, Chilimbi T (2009) A composable model for analyzing locality of multi-threaded programs. Technical Report, MSR-TR-2009-107, Microoft"},{"issue":"4","key":"3949_CR34","doi-asserted-by":"publisher","first-page":"692","DOI":"10.1007\/s11390-014-1460-7","volume":"29","author":"C Ding","year":"2014","unstructured":"Ding C, Xiang X, Bao B, Luo H, Luo YW, Wang XL (2014) Performance metrics and models for shared cache. J Comput Sci Technol 29(4):692\u2013712","journal-title":"J Comput Sci Technol"},{"key":"3949_CR35","unstructured":"Ding C, Zhong Y (2001) Reuse distance analysis. University of Rochester, Rochester, NY, USA, Technical Report"},{"issue":"5","key":"3949_CR36","doi-asserted-by":"publisher","first-page":"245","DOI":"10.1145\/781131.781159","volume":"38","author":"C Ding","year":"2003","unstructured":"Ding C, Zhong Y (2003) Predicting whole-program locality through reuse distance analysis. Proceedings of the ACM SIGPLAN 2003 conference on Programming language design and implementation ( 38(5):245\u2013257","journal-title":"Proceedings of the ACM SIGPLAN 2003 conference on Programming language design and implementation ("},{"key":"3949_CR37","doi-asserted-by":"crossref","unstructured":"Dubach C, Jones T, O\u2019Boyle M (2007) Microarchitectural design space exploration using an architecture-centric approach. In: 40th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO 2007), pp. 262\u2013271","DOI":"10.1109\/MICRO.2007.12"},{"key":"3949_CR38","doi-asserted-by":"crossref","unstructured":"Duong N, Zhao D, Kim T, Cammarota R, Valero M, Veidenbaum AV (2012) Improving cache management policies using dynamic reuse distances. In: Proceedings of IEEE\/ACM International Symposium on Microarchitecture, MICRO-45, pp. 389\u2013400. IEEE, Piscataway, NJ, USA","DOI":"10.1109\/MICRO.2012.43"},{"issue":"1","key":"3949_CR39","first-page":"1","volume":"5","author":"L Eeckhout","year":"2010","unstructured":"Eeckhout L (2010) Computer architecture performance evaluation methods. Synth Lect Comput Archit 5(1):1\u2013145","journal-title":"Synth Lect Comput Archit"},{"key":"3949_CR40","unstructured":"Fog A (2016) Instruction tables: Lists of instruction latencies, throughputs and micro-operation breakdowns for Intel, AMD and VIA CPUs"},{"key":"3949_CR41","doi-asserted-by":"crossref","unstructured":"Grass T, Allande C, Armejach A, Rico A, Ayguad\u00e9 E, Labarta J, Valero M, Casas M, Moreto M (2016) MUSA: a multi-level simulation approach for next-generation HPC machines. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201916. IEEE Press","DOI":"10.1109\/SC.2016.44"},{"key":"3949_CR42","doi-asserted-by":"crossref","unstructured":"Grauer-Gray S, Xu L, Searles R, Ayalasomayajula S, Cavazos J, (2012) Auto-tuning a high-level language targeted to GPU codes. In 2012 Innovative Parallel Computing (InPar). IEEE, Piscataway, NJ, USA, pp 1\u201310","DOI":"10.1109\/InPar.2012.6339595"},{"issue":"4","key":"3949_CR43","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1145\/1054907.1054914","volume":"31","author":"N Hardavellas","year":"2004","unstructured":"Hardavellas N, Somogyi S, Wenisch TF, Wunderlich RE, Chen S, Kim J, Falsafi B, Hoe JC, Nowatzyk AG (2004) SimFlex: a fast, accurate, flexible full-system simulation framework for performance evaluation of Server Architecture. SIGMETRICS Perform Eval Rev 31(4):31\u201334","journal-title":"SIGMETRICS Perform Eval Rev"},{"issue":"10","key":"3949_CR44","first-page":"764","volume":"38","author":"RIP Heywood","year":"1995","unstructured":"Heywood RIP, Howel F (1995) HASE: a flexible toolset for computer architects. Comput J 38(10):764\u2013775","journal-title":"Comput J"},{"key":"3949_CR45","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1109\/MC.2008.209","volume":"41","author":"MD Hill","year":"2008","unstructured":"Hill MD, Marty MR (2008) Amdahl\u2019s law in the multicore era. IEEE Comput 41:33\u201338","journal-title":"IEEE Comput"},{"issue":"2","key":"3949_CR46","doi-asserted-by":"publisher","first-page":"40","DOI":"10.1109\/2.982915","volume":"35","author":"CJ Hughes","year":"2002","unstructured":"Hughes CJ, Pai VS, Ranganathan P, Adve SV (2002) Rsim: simulating shared-memory multiprocessors with ILP processors. Computer 35(2):40\u201349","journal-title":"Computer"},{"key":"3949_CR47","unstructured":"Huh J, Burger D, Keckler SW (2001) Exploring the design space of future CMPs. In: Proceedings of the 2001 International Conference on Parallel Architectures and Compilation Techniques, PACT \u201901, pp 199\u2013210. IEEE Computer Society, USA"},{"key":"3949_CR48","unstructured":"$$\\ddot{{\\rm I}}$$pek E, McKee SA, Caruana R, de\u00a0Supinski BR, Schulz M (2006) Efficiently exploring architectural design spaces via predictive modeling. In: Proceedings of the 12th International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS XII, pp 195\u2013206. Association for Computing Machinery, New York, NY, USA"},{"key":"3949_CR49","doi-asserted-by":"crossref","unstructured":"Jiang Y, Zhang EZ, Tian K, Shen X (2010) Is reuse distance applicable to data locality analysis on chip multiprocessors? In: Proceedings of the 19th Joint European Conference on Theory and Practice of Software, International Conference on Compiler Construction, CC\u201910\/ETAPS\u201910, pp 264\u2013282. Springer","DOI":"10.1007\/978-3-642-11970-5_15"},{"key":"3949_CR50","unstructured":"Kaxiras S, Young C (2000) Coherence communication prediction in shared-memory multiprocessors. In: IEEE Proceedings Sixth International Symposium on High-performance Computer Architecture. HPCA-6 (Cat. No. PR00550), pp 156\u2013167 Piscataway, NJ, USA. IEEE"},{"key":"3949_CR51","doi-asserted-by":"crossref","unstructured":"Keramidas G, Petoumenos P, Kaxiras S (2007) Cache replacement based on reuse-distance prediction. In: 2007 25th International Conference on Computer Design. IEEE, NY, USA, pp 245\u2013250","DOI":"10.1109\/ICCD.2007.4601909"},{"key":"3949_CR52","doi-asserted-by":"crossref","unstructured":"Kise K, Katagiri T, Honda H, Yuba, T (2004) The simCore\/alpha functional simulator. In: Proceedings of the 2004 workshop on computer architecture education: held in conjunction with the 31st International symposium on computer architecture, pp. 24\u2013es","DOI":"10.1145\/1275571.1275602"},{"key":"3949_CR53","unstructured":"Lattner C, Adve V (2004) LLVM: a compilation framework for lifelong program analysis and transformation. In: Proceedings of the International Symposium on Code Generation and Optimization: Feedback-directed and Runtime Optimization, CGO \u201904, pp 75\u201386. IEEE Computer Society, Washington, DC, USA"},{"key":"3949_CR54","doi-asserted-by":"crossref","unstructured":"Lee BC, Collins J, Wang H, Brooks D (2008) CPR: composable performance regression for scalable multiprocessor models. In: 2008 41st IEEE\/ACM International Symposium on Microarchitecture, pp 270\u2013281. IEEE","DOI":"10.1109\/MICRO.2008.4771797"},{"key":"3949_CR55","doi-asserted-by":"crossref","unstructured":"Lee S, Meredith JS, Vetter JS (2015) COMPASS: a framework for automated performance modeling and prediction. In: Proceedings of the 29th ACM on International Conference on Supercomputing, ICS \u201915, pp 405\u2013414. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/2751205.2751220"},{"key":"3949_CR56","doi-asserted-by":"crossref","unstructured":"Liao C, Quinlan DJ, Panas T, de\u00a0Supinski BR (2010) A ROSE-based openMP 3.0 research compiler supporting multiple runtime libraries. In: Proceedings of the 6th International Conference on Beyond Loop Level Parallelism in OpenMP: Accelerators, Tasking and More, IWOMP\u201910, pp 15\u201328. Springer-Verlag, Berlin, Heidelberg","DOI":"10.1007\/978-3-642-13217-9_2"},{"key":"3949_CR57","doi-asserted-by":"crossref","unstructured":"Maeda RKV, Cai Q, Xu J, Wang Z, Tian Z, (2017) Fast and accurate exploration of multi-level caches using hierarchical reuse distance. In: 2017 IEEE International symposium on high performance computer architecture (HPCA). IEEE, Piscataway, NJ, USA, pp 145\u2013156","DOI":"10.1109\/HPCA.2017.11"},{"key":"3949_CR58","doi-asserted-by":"crossref","unstructured":"Malakar P, Balaprakash P, Vishwanath V, Morozov V, Kumaran K (2018) Benchmarking machine learning methods for performance modeling of scientific applications. In: 2018 IEEE\/ACM Performance modeling, benchmarking and simulation of high performance computer systems (PMBS), pp 33\u201344","DOI":"10.1109\/PMBS.2018.8641686"},{"issue":"2","key":"3949_CR59","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1147\/sj.92.0078","volume":"9","author":"RL Mattson","year":"1970","unstructured":"Mattson RL, Gecsei J, Slutz DR, Traiger IL (1970) Evaluation techniques for storage hierarchies. IBM Syst J 9(2):78\u2013117","journal-title":"IBM Syst J"},{"issue":"5","key":"3949_CR60","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1145\/1127577.1127586","volume":"33","author":"C McCurdy","year":"2005","unstructured":"McCurdy C, Fischer C (2005) Using pin as a memory reference generator for multiprocessor simulation. SIGARCH Comput Archit News 33(5):39\u201344","journal-title":"SIGARCH Comput Archit News"},{"issue":"6","key":"3949_CR61","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1145\/1273442.1250746","volume":"42","author":"N Nethercote","year":"2007","unstructured":"Nethercote N, Seward J (2007) Valgrind: a framework for heavyweight dynamic binary instrumentation. ACM SIGPLAN Notices 42(6):89\u2013100","journal-title":"ACM SIGPLAN Notices"},{"key":"3949_CR62","doi-asserted-by":"crossref","unstructured":"Niu Q, Dinan J, Lu Q, Sadayappan P (2012) PARDA: a fast parallel reuse distance analysis algorithm. In: Proceedings of the 2012 IEEE 26th International parallel and distributed processing symposium, IPDPS \u201912, pp 1284\u20131294. IEEE Computer Society, USA","DOI":"10.1109\/IPDPS.2012.117"},{"key":"3949_CR63","doi-asserted-by":"crossref","unstructured":"Obaida MA, Liu J, Chennupati G, Santhi N, Eidenbenz S (2018) Parallel application performance prediction using analysis based models and HPC simulations. In: Proceedings of the 2018 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation, SIGSIM-PADS \u201918, pp 49\u201359. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/3200921.3200937"},{"key":"3949_CR64","doi-asserted-by":"crossref","unstructured":"Pakin S, McCormick P (2013) Hardware-independent application characterization. In: 2013 IEEE International Symposium on Workload Characterization (IISWC), pp 111\u2013112","DOI":"10.1109\/IISWC.2013.6704676"},{"key":"3949_CR65","doi-asserted-by":"crossref","unstructured":"Patel A, Afram F, Chen S, Ghose K (2011) MARSS: a full system simulator for multicore X86 CPUs. In: Proceedings of the 48th Design Automation Conference, DAC \u201911, pp 1050\u20131055. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/2024724.2024954"},{"key":"3949_CR66","unstructured":"Payer M, Kravina E, Gross TR, (2013) Lightweight memory tracing. In: 2013 USENIX Annual Technical Conference (USENIX ATC 13). USENIX Association, San Jose, CA, pp 115\u2013126"},{"key":"3949_CR67","unstructured":"Pouchet LN (2012) Polybench: the polyhedral benchmark suite. URL: http:\/\/www.cs.ucla.edu\/pouchet\/software\/polybench"},{"key":"3949_CR68","doi-asserted-by":"crossref","unstructured":"Reddi VJ, Settle A, Connors DA, Cohn RS (2004) PIN: a binary instrumentation tool for computer architecture research and education. In: Proceedings of the 2004 Workshop on computer architecture education: Held in Conjunction with the 31st International Symposium on Computer Architecture, WCAE \u201904, pp 22\u2013es. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/1275571.1275600"},{"issue":"4","key":"3949_CR69","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1145\/1964218.1964225","volume":"38","author":"AF Rodrigues","year":"2011","unstructured":"Rodrigues AF, Hemmert KS, Barrett BW, Kersey C, Oldfield R, Weston M, Risen R, Cook J, Rosenfeld P, Cooper-Balis E, Jacob B (2011) The structural simulation toolkit. SIGMETRICS Perform Eval Rev 38(4):37\u201342","journal-title":"SIGMETRICS Perform Eval Rev"},{"issue":"8","key":"3949_CR70","doi-asserted-by":"publisher","first-page":"1704","DOI":"10.1109\/TPDS.2019.2896633","volume":"30","author":"JM Sabarimuthu","year":"2019","unstructured":"Sabarimuthu JM, Venkatesh TG (2019) Analytical derivation of concurrent reuse distance profile for multi-threaded application running on chip multi-processor. IEEE Trans Parallel Distrib Syst 30(8):1704\u20131721","journal-title":"IEEE Trans Parallel Distrib Syst"},{"issue":"3","key":"3949_CR71","doi-asserted-by":"publisher","first-page":"475","DOI":"10.1145\/2508148.2485963","volume":"41","author":"D Sanchez","year":"2013","unstructured":"Sanchez D, Kozyrakis C (2013) ZSim: fast and accurate microarchitectural simulation of thousand-core systems. SIGARCH Comput Archit News 41(3):475\u2013486","journal-title":"SIGARCH Comput Archit News"},{"key":"3949_CR72","doi-asserted-by":"crossref","unstructured":"Santhi N, Eidenbenz S, Liu J (2015) The Simian concept: parallel discrete event simulation with interpreted languages and just-in-time compilation. In: 2015 Winter Simulation Conference (WSC), pp. 3013\u20133024","DOI":"10.1109\/WSC.2015.7408405"},{"key":"3949_CR73","doi-asserted-by":"crossref","unstructured":"Schuff DL, Kulkarni M, Pai VS (2010) Accelerating multicore reuse distance analysis with sampling and parallelization. In: Proceedings of the 19th International Conference on Parallel Architectures and Compilation Techniques, PACT \u201910, p. 53\u201364. ACM","DOI":"10.1145\/1854273.1854286"},{"key":"3949_CR74","doi-asserted-by":"crossref","unstructured":"Schuff DL, Parsons BS, Pai VS (2010) Multicore-aware reuse distance analysis. In: 2010 IEEE International symposium on parallel and distributed processing. Workshops and Phd Forum (IPDPSW). IEEE, IEEE, Piscataway, NJ, USA, pp 1\u20138","DOI":"10.1109\/IPDPSW.2010.5470780"},{"key":"3949_CR75","doi-asserted-by":"crossref","unstructured":"Sen R, Wood DA (2013) Reuse-based Online Models for Caches. In: Proceedings of the ACM SIGMETRICS\/International Conference on Measurement and Modeling of Computer Systems, SIGMETRICS \u201913, pp. 279\u2013292. ACM, New York, NY, USA","DOI":"10.1145\/2465529.2465756"},{"key":"3949_CR76","first-page":"1","volume-title":"High Perform Comput Comput Sci\u2013VECPAR 2010","author":"J Shalf","year":"2011","unstructured":"Shalf J, Dosanjh S, Morrison J (2011) Exascale computing technology challenges. In: Palma JMLM, Dayd\u00e9 M, Marques O, Lopes JC (eds) High Perform Comput Comput Sci\u2013VECPAR 2010. Springer, Berlin, pp 1\u201325"},{"key":"3949_CR77","unstructured":"Sharkey J, Ponomarev D, Ghose K (2005) Abstract M-SIM: a flexible, multithreaded architectural simulation environment. Technical report, Department of Computer Science, State University of New York at Binghamton"},{"key":"3949_CR78","doi-asserted-by":"crossref","unstructured":"Shen X, Shaw J, Meeker B, Ding C (2007) Locality Approximation Using Time. In: Proceedings of the 34th annual ACM SIGPLAN-SIGACT symposium on principles of programming languages, POPL \u201907, pp. 55\u201361. ACM, New York, NY, USA","DOI":"10.1145\/1190216.1190227"},{"issue":"12","key":"3949_CR79","doi-asserted-by":"publisher","first-page":"1752","DOI":"10.1109\/TPDS.2009.31","volume":"20","author":"X Shi","year":"2009","unstructured":"Shi X, Su F, Peir JK, Xia Y, Yang Z (2009) Modeling and stack simulation of CMP cache capacity and accessibility. IEEE Trans Parallel Distrib Syst 20(12):1752\u20131763","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"3949_CR80","doi-asserted-by":"crossref","unstructured":"Spafford KL, Vetter JS (2012) Aspen: A domain specific language for performance modeling. In: SC \u201912: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 1\u201311","DOI":"10.1109\/SC.2012.20"},{"issue":"12","key":"3949_CR81","first-page":"3537","volume":"65","author":"S Van den Steen","year":"2016","unstructured":"Van den Steen S, Eyerman S, De Pestel S, Mechri M, Carlson T, Black-Schaffer D, Hagersten E, Eeckhout L (2016) Analytical processor performance and power modeling using micro-architecture independent characteristics. IEEE Trans Comput 65(12):3537\u20133551","journal-title":"IEEE Trans Comput"},{"issue":"1","key":"3949_CR82","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1109\/LCA.2017.2701370","volume":"17","author":"SV den Steen","year":"2018","unstructured":"den Steen SV, Eeckhout L (2018) Modeling superscalar processor memory-level parallelism. IEEE Comput. Archit. Lett 17(1):9\u201312","journal-title":"IEEE Comput. Archit. Lett"},{"issue":"3","key":"3949_CR83","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1145\/2024723.2000109","volume":"39","author":"G Sun","year":"2011","unstructured":"Sun G, Hughes CJ, Kim C, Zhao J, Xu C, Xie Y, Chen YK (2011) Moguls: a model to explore the memory hierarchy for bandwidth improvements. SIGARCH Comput Archit News 39(3):377\u2013388","journal-title":"SIGARCH Comput Archit News"},{"key":"3949_CR84","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1007\/978-3-642-11261-4_11","volume-title":"Tools for high performance computing 2009","author":"D Terpstra","year":"2010","unstructured":"Terpstra D, Jagode H, You H, Dongarra J (2010) Collecting performance data with PAPI-C. Tools for high performance computing 2009. Springer, Berlin, pp 157\u2013173"},{"key":"3949_CR85","unstructured":"Thazhuthaveetil M, Vaswani K, Joseph P (2006) Construction and use of linear regression models for processor performance analysis. In: Twelfth international symposium on high-performance computer architecture. IEEE Computer Society, Los Alamitos, CA, USA, pp. 99\u2013108"},{"issue":"2","key":"3949_CR86","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1177\/1094342014568690","volume":"29","author":"D Unat","year":"2015","unstructured":"Unat D, Chan C, Zhang W, Williams S, Bachan J, Bell J, Shalf J (2015) ExaSAT: an exascale co-design tool for performance modeling. Int J High Perform Comput Appl 29(2):209\u2013232","journal-title":"Int J High Perform Comput Appl"},{"key":"3949_CR87","doi-asserted-by":"crossref","unstructured":"Wu MJ, Yeung D (2012) Identifying optimal multicore cache hierarchies for loop-based parallel programs via reuse distance analysis. In: Proceedings of the 2012 ACM SIGPLAN workshop on memory systems performance and correctness, MSPC \u201912, pp. 2\u201311. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/2247684.2247687"},{"issue":"1","key":"3949_CR88","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2427631.2427632","volume":"31","author":"MJ Wu","year":"2013","unstructured":"Wu MJ, Yeung D (2013) Efficient reuse distance analysis of multicore scaling for loop-based parallel programs. ACM Trans Comput Syst 31(1):1\u201337","journal-title":"ACM Trans Comput Syst"},{"key":"3949_CR89","doi-asserted-by":"crossref","unstructured":"Wu MJ, Zhao M, Yeung D (2013) Studying multicore processor scaling via reuse distance analysis. In: Proceedings of the 40th Annual international symposium on computer architecture, ISCA \u201913, p. 499\u2013510. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/2485922.2485965"},{"issue":"8","key":"3949_CR90","doi-asserted-by":"publisher","first-page":"1256","DOI":"10.1016\/j.jcss.2013.02.005","volume":"79","author":"X Wu","year":"2013","unstructured":"Wu X, Taylor V (2013) Performance modeling of hybrid MPI\/openMP scientific applications on large-scale multicore Supercomputers. J Comput Syst Sci 79(8):1256\u20131268","journal-title":"J Comput Syst Sci"},{"issue":"2","key":"3949_CR91","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3092702","volume":"35","author":"M Zhao","year":"2017","unstructured":"Zhao M, Yeung D (2017) Using multicore reuse distance to study coherence directories. ACM Trans Comput Syst (TOCS) 35(2):1\u201349","journal-title":"ACM Trans Comput Syst (TOCS)"},{"issue":"3","key":"3949_CR92","doi-asserted-by":"publisher","first-page":"328","DOI":"10.1109\/TC.2007.50","volume":"56","author":"Y Zhong","year":"2007","unstructured":"Zhong Y, Dropsho SG, Shen X, Studer A, Ding C (2007) Miss rate prediction across program inputs and cache configurations. IEEE Trans Comput 56(3):328\u2013343","journal-title":"IEEE Trans Comput"},{"issue":"6","key":"3949_CR93","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1552309.1552310","volume":"31","author":"Y Zhong","year":"2009","unstructured":"Zhong Y, Shen X, Ding C (2009) Program locality analysis using reuse distance. ACM Trans Program Lang Syst 31(6):1\u201339","journal-title":"ACM Trans Program Lang Syst"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-03949-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-021-03949-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-03949-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,24]],"date-time":"2022-01-24T11:28:00Z","timestamp":1643023680000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-021-03949-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,28]]},"references-count":93,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2022,2]]}},"alternative-id":["3949"],"URL":"https:\/\/doi.org\/10.1007\/s11227-021-03949-4","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2021,6,28]]},"assertion":[{"value":"22 May 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 June 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"This paper has been approved for unlimited public distribution under LA-UR-21-22749. Any opinions, findings, and\/or conclusions expressed in this paper do not necessarily represent the views of the DOE or the US Government.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}]}}