{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,30]],"date-time":"2025-08-30T17:04:23Z","timestamp":1756573463212,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"European High Performance Computing Joint Undertaking (JU) under Framework Partnership Agreement No 800928 and Specific Grant Agreement No 101036168"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,9,30]]},"DOI":"10.1145\/3695794.3695800","type":"proceedings-article","created":{"date-parts":[[2024,12,12]],"date-time":"2024-12-12T04:06:53Z","timestamp":1733976413000},"page":"45-60","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Data Prefetching on Processors with Heterogeneous Memory"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5982-6687","authenticated-orcid":false,"given":"Berk","family":"Saglam","sequence":"first","affiliation":[{"name":"J\u00fclich Supercomputing Centre; Institute for Advanced Simulation, Forschungszentrum J\u00fclich, J\u00fclich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6973-4120","authenticated-orcid":false,"given":"Nam","family":"Ho","sequence":"additional","affiliation":[{"name":"J\u00fclich Supercomputing Centre; Institute for Advanced Simulation, Forschungszentrum J\u00fclich, J\u00fclich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0382-7743","authenticated-orcid":false,"given":"Carlos","family":"Falquez","sequence":"additional","affiliation":[{"name":"J\u00fclich Supercomputing Centre; Institute for Advanced Simulation, Forschungszentrum J\u00fclich, J\u00fclich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1319-6404","authenticated-orcid":false,"given":"Antoni","family":"Portero","sequence":"additional","affiliation":[{"name":"J\u00fclich Supercomputing Centre; Institute for Advanced Simulation, Forschungszentrum J\u00fclich, J\u00fclich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2982-0566","authenticated-orcid":false,"given":"Fabian","family":"Sch\u00e4tzle","sequence":"additional","affiliation":[{"name":"Central Institute of Engineering, Electronics and Analytics ZEA-2, Forschungszentrum J\u00fclich, J\u00fclich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0748-7264","authenticated-orcid":false,"given":"Estela","family":"Suarez","sequence":"additional","affiliation":[{"name":"J\u00fclich Supercomputing Centre; Institute for Advanced Simulation, Forschungszentrum J\u00fclich, J\u00fclich, Germany and Computer Science Department, University of Bonn, Bonn, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7296-7817","authenticated-orcid":false,"given":"Dirk","family":"Pleiter","sequence":"additional","affiliation":[{"name":"Division of Computational Science and Technology, EECS, KTH Royal Institute of Technology, Stockholm, Sweden"}]}],"member":"320","published-online":{"date-parts":[[2024,12,11]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","unstructured":"Dmytro Apalkov Alexey Khvalkovskiy Steven Watts Vladimir Nikitin Xueti Tang Daniel Lottis Kiseok Moon Xiao Luo Eugene Chen Adrian Ong Alexander Driskill-Smith and Mohamad Krounbi. 2013. Spin-transfer torque magnetic random access memory (STT-MRAM). J. Emerg. Technol. Comput. Syst. 9 2 Article 13 (may 2013) 35\u00a0pages. 10.1145\/2463585.2463589https:\/\/dl.acm.org\/doi\/10.1145\/2463585.2463589","DOI":"10.1145\/2463585.2463589"},{"volume-title":"AMBA\u00ae 5 CHI architecture specification","year":"2020","key":"e_1_3_3_2_3_2","unstructured":"Arm. 2020. AMBA\u00ae 5 CHI architecture specification. https:\/\/developer.arm.com\/documentation\/ihi0050\/ea\/, 2020"},{"volume-title":"Arm\u00ae Neoverse\u2122 CMN\u2011650 Coherent Mesh Network Technical Reference Manual","year":"2021","key":"e_1_3_3_2_4_2","unstructured":"Arm. 2021. Arm\u00ae Neoverse\u2122 CMN\u2011650 Coherent Mesh Network Technical Reference Manual. https:\/\/developer.arm.com\/documentation\/101481\/0200\/?lang=en"},{"volume-title":"Arm\u00ae Neoverse\u2122 V1 reference design - software developer guide.","year":"2021","key":"e_1_3_3_2_5_2","unstructured":"Arm. 2021. Arm\u00ae Neoverse\u2122 V1 reference design - software developer guide.https:\/\/developer.arm.com\/documentation\/PJDOC-1779577084-33214\/ RelG?lang=en"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","unstructured":"Jean-Loup Baer and Tien-Fu Chen. 1995. Effective Hardware-Based Data Prefetching for High-Performance Processors. IEEE Trans. Comput. 44 5 (may 1995) 609\u2013623. 10.1109\/12.381947 https:\/\/dl.acm.org\/doi\/10.1109\/12.381947","DOI":"10.1109\/12.381947"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358325"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER49012.2020.00079"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS59251.2023.10254718"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00017"},{"key":"e_1_3_3_2_11_2","volume-title":"A primer on hardware prefetching","author":"Falsafi Babak","year":"2022","unstructured":"Babak Falsafi and Thomas\u00a0F Wenisch. 2022. A primer on hardware prefetching. Springer Nature."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-84882-409-6_8"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3243176.3243181"},{"key":"e_1_3_3_2_14_2","unstructured":"Wim Heirman Ibrahim Hur Ugonna Echeruo Stijn Eyerman and Kristof Du\u00a0Bois. 2020. Apparatus method and system for enhanced data prefetching based on non-uniform memory access (NUMA) characteristics. US Patent 10 621 099."},{"key":"e_1_3_3_2_15_2","unstructured":"Wim Heirman Ibrahim Hur Ugonna Echeruo Stijn Eyerman and Kristof du Bois. U.S. Patent 11 256 626 B2 Feb. 22 2022. Apparatus method and system for enhanced data prefetching based on non-uniform memory access (NUMA) characteristics."},{"key":"e_1_3_3_2_16_2","unstructured":"Mark\u00a0D. Hill. 2019. Three Other Models of Computer System Performance. CoRR abs\/1901.02926 (2019). arXiv:https:\/\/arXiv.org\/abs\/1901.02926http:\/\/arxiv.org\/abs\/1901.02926"},{"key":"e_1_3_3_2_17_2","unstructured":"Yasuo Ishii Mary Inaba and Kei Hiraki. 2011. Access map pattern matching for high performance data cache prefetch. Journal of Instruction-Level Parallelism 13 2011 (2011) 1\u201324."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.51"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485957"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","unstructured":"D. Joseph and D. Grunwald. 1999. Prefetching using Markov predictors. IEEE Trans. Comput. 48 2 (1999) 121\u2013133. 10.1109\/12.752653 https:\/\/dl.acm.org\/doi\/10.1109\/12.752653","DOI":"10.1109\/12.752653"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","unstructured":"Norman\u00a0P. Jouppi. 1990. Improving Direct-Mapped Cache Performance by the Addition of a Small Fully-Associative Cache and Prefetch Buffers. SIGARCH Comput. Archit. News 18 2SI (may 1990) 364\u2013373. 10.1145\/325096.325162https:\/\/dl.acm.org\/doi\/10.1145\/325096.325162","DOI":"10.1145\/325096.325162"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783763"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783763"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.23919\/DATE54114.2022.9774765"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","unstructured":"Moritz Kreutzer Georg Hager Gerhard Wellein Holger Fehske and Alan\u00a0R. Bishop. 2014. A Unified Sparse Matrix Data Format for Efficient General Sparse Matrix-Vector Multiplication on Modern Processors with Wide SIMD Units. SIAM Journal on Scientific Computing 36 5 (2014) C401\u2013C423. 10.1137\/130930352 https:\/\/dl.acm.org\/doi\/10.1137\/130930352","DOI":"10.1137\/130930352"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","unstructured":"Benjamin\u00a0C. Lee Engin Ipek Onur Mutlu and Doug Burger. 2009. Architecting phase change memory as a scalable dram alternative. 37 3 (jun 2009) 2\u201313. 10.1145\/1555815.1555758https:\/\/dl.acm.org\/doi\/10.1145\/1555815.1555758","DOI":"10.1145\/1555815.1555758"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD.2011.6081427"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3240302.3240315"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","unstructured":"Ing-Chao Lin Da-Wei Chang Wei-Jun Chen Jian-Ting Ke and Po-Han Huang. 2020. Global Clean Page First Replacement and Index-Aware Multistream Prefetcher in Hybrid Memory Architecture. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 39 9 (2020) 1750\u20131763. 10.1109\/TCAD.2019.2925404","DOI":"10.1109\/TCAD.2019.2925404"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.15"},{"key":"e_1_3_3_2_31_2","unstructured":"Jason Lowe-Power Abdul\u00a0Mutaal Ahmad Ayaz Akram Mohammad Alian Rico Amslinger Matteo Andreozzi Adri\u00e0 Armejach Nils Asmussen Srikant Bharadwaj Gabe Black Gedare Bloom Bobby\u00a0R. Bruce Daniel\u00a0Rodrigues Carvalho Jer\u00f3nimo Castrill\u00f3n Lizhong Chen Nicolas Derumigny Stephan Diestelhorst Wendy Elsasser Marjan Fariborz Amin\u00a0Farmahini Farahani Pouya Fotouhi Ryan Gambord Jayneel Gandhi Dibakar Gope Thomas Grass Bagus Hanindhito Andreas Hansson Swapnil Haria Austin Harris Timothy Hayes Adrian Herrera Matthew Horsnell Syed Ali\u00a0Raza Jafri Radhika Jagtap Hanhwi Jang Reiley Jeyapaul Timothy\u00a0M. Jones Matthias Jung Subash Kannoth Hamidreza Khaleghzadeh Yuetsu Kodama Tushar Krishna Tommaso Marinelli Christian Menard Andrea Mondelli Tiago M\u00fcck Omar Naji Krishnendra Nathella Hoa Nguyen Nikos Nikoleris Lena\u00a0E. Olson Marc\u00a0S. Orr Binh Pham Pablo Prieto Trivikram Reddy Alec Roelke Mahyar Samani Andreas Sandberg Javier Setoain Boris Shingarov Matthew\u00a0D. Sinclair Tuan Ta Rahul Thakur Giacomo Travaglini Michael Upton Nilay Vaish Ilias Vougioukas Zhengrong Wang Norbert Wehn Christian Weis David\u00a0A. Wood Hongil Yoon and \u00c9der\u00a0F. Zulian. 2020. The gem5 Simulator: Version 20.0+. CoRR abs\/2007.03152 (2020). arXiv:https:\/\/arXiv.org\/abs\/2007.03152https:\/\/arxiv.org\/abs\/2007.03152"},{"volume-title":"STREAM: Sustainable Memory Bandwidth in High Performance Computers","author":"McCalpin John\u00a0D.","key":"e_1_3_3_2_32_2","unstructured":"John\u00a0D. McCalpin. [n. d.]. STREAM: Sustainable Memory Bandwidth in High Performance Computers. https:\/\/www.cs.virginia.edu\/stream\/"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","unstructured":"Jagan\u00a0Singh Meena Simon\u00a0Min Sze Umesh Chand and Tseung-Yuen Tseng. 2014. Overview of emerging nonvolatile memory technologies. Nanoscale Research Letters 9 Article 526 (2014). 10.1186\/1556-276X-9-526","DOI":"10.1186\/1556-276X-9-526"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056027"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446087"},{"key":"e_1_3_3_2_36_2","volume-title":"DDR5 SDRAM Product Core Data Sheet","author":"Micron\u00a0Technology Inc","year":"2022","unstructured":"Inc Micron\u00a0Technology. 2022. DDR5 SDRAM Product Core Data Sheet. https:\/\/www.micron.com\/products\/memory\/dram-components\/ddr5-sdram\/part-catalog\/part-detail\/mt60b2g8hb-48b-a, 2022"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Sparsh Mittal. 2016. A survey of recent prefetching techniques for processor caches. ACM Computing Surveys (CSUR) 49 2 (2016) 1\u201335.","DOI":"10.1145\/2907071"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-11515-8_10"},{"key":"e_1_3_3_2_39_2","volume-title":"CHI","author":"M\u00fcck Tiago","year":"2021","unstructured":"Tiago M\u00fcck. 2021. CHI. https:\/\/www.gem5.org\/documentation\/general_docs\/ruby\/CHI\/"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","unstructured":"Carlos Navarro Josu\u00e9 Feliu Salvador Petit Maria\u00a0E. G\u00f3mez and Julio Sahuquillo. 2020. Bandwidth-Aware Dynamic Prefetch Configuration for IBM POWER8. IEEE Transactions on Parallel and Distributed Systems 31 8 (2020) 1970\u20131982. 10.1109\/TPDS.2020.2982392","DOI":"10.1109\/TPDS.2020.2982392"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00072"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00072"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2004.10030"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Nuno Neves Pedro Tom\u00e1s and Nuno Roma. 2020. Compiler-assisted data streaming for regular code structures. IEEE Trans. Comput. 70 3 (2020) 483\u2013494.","DOI":"10.1109\/TC.2020.2990302"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00019"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","unstructured":"Geraldo\u00a0F. Oliveira Juan G\u00f3mez-Luna Lois Orosa Saugata Ghose Nandita Vijaykumar Ivan Fernandez Mohammad Sadrosadati and Onur Mutlu. 2021. DAMOV: A New Methodology and Benchmark Suite for Evaluating Data Movement Bottlenecks. IEEE Access 9 (2021) 134457\u2013134502. 10.1109\/ACCESS.2021.3110993","DOI":"10.1109\/ACCESS.2021.3110993"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614245"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2011.7477494"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS52781.2021.9567483"},{"volume-title":"miniFE Finite Element Mini-Application","author":"Project Mantevo","key":"e_1_3_3_2_50_2","unstructured":"Mantevo Project. [n. d.]. miniFE Finite Element Mini-Application. https:\/\/github.com\/Mantevo\/miniFE"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","unstructured":"Yasir\u00a0Mahmood Qureshi William\u00a0Andrew Simon Marina Zapater Katzalin Olcoz and David Atienza. 2021. Gem5-X: A Many-core Heterogeneous Simulation Platform for Architectural Exploration and Optimization. ACM Trans. Archit. Code Optim. 18 4 Article 44 (jul 2021) 27\u00a0pages. 10.1145\/3461662https:\/\/dl.acm.org\/doi\/10.1145\/3461662","DOI":"10.1145\/3461662"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/1995896.1995911"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527398"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","unstructured":"Julian Shun and Guy\u00a0E. Blelloch. 2013. Ligra: A Lightweight Graph Processing Framework for Shared Memory(PPoPP \u201913). Association for Computing Machinery New York NY USA 135\u2013146. 10.1145\/2442516.2442530https:\/\/dl.acm.org\/doi\/10.1145\/2442516.2442530","DOI":"10.1145\/2442516.2442530"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","unstructured":"A.\u00a0J. Smith. 1978. Sequential Program Prefetching in Memory Hierarchies. Computer 11 12 (dec 1978) 7\u201321. 10.1109\/C-M.1978.218016 https:\/\/dl.acm.org\/doi\/10.1109\/C-M.1978.218016","DOI":"10.1109\/C-M.1978.218016"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","unstructured":"Avinash Sodani Roger Gramunt Jesus Corbal Ho-Seop Kim Krishna Vinod Sundaram Chinthamani Steven Hutsell Rajat Agarwal and Yen-Chen Liu. 2016. Knights Landing: Second-Generation Intel Xeon Phi Product. IEEE Micro 36 2 (2016) 34\u201346. 10.1109\/MM.2016.25 https:\/\/dl.acm.org\/doi\/10.1109\/MM.2016.25","DOI":"10.1109\/MM.2016.25"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2007.346185"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2007.346185"},{"key":"e_1_3_3_2_59_2","unstructured":"Isaac S\u00e1nchez\u00a0Barrera. 2022. Exploiting data locality in cache-coherent NUMA systems. Ph.\u00a0D. Dissertation. Universitat Polit\u00e8cnica de Catalunya. http:\/\/hdl.handle.net\/2117\/367546"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.5555\/2588259"},{"volume-title":"Simple benchmark for memory throughput and latency","key":"e_1_3_3_2_61_2","unstructured":"Tinymembench. [n. d.]. Simple benchmark for memory throughput and latency. https:\/\/www.cs.virginia.edu\/stream\/"},{"key":"e_1_3_3_2_62_2","volume-title":"Clustering Modes in Knights Landing Processors: Developer\u2019s Guide","author":"Vladimirov Andrey","year":"2016","unstructured":"Andrey Vladimirov and Ryo Asai. 2016. Clustering Modes in Knights Landing Processors: Developer\u2019s Guide. https:\/\/colfaxresearch.com\/knl-numa\/"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2005.59"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","unstructured":"Wm.\u00a0A. Wulf and Sally\u00a0A. McKee. 1995. Hitting the Memory Wall: Implications of the Obvious. SIGARCH Comput. Archit. News 23 1 (1995) 20\u201324. 10.1145\/216585.216588https:\/\/dl.acm.org\/doi\/10.1145\/216585.216588","DOI":"10.1145\/216585.216588"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/1810085.1810110"}],"event":{"name":"MEMSYS '24: The International Symposium on Memory Systems","acronym":"MEMSYS '24","location":"Washington DC USA"},"container-title":["Proceedings of the International Symposium on Memory Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695794.3695800","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695794.3695800","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:06Z","timestamp":1750295406000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695794.3695800"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"references-count":64,"alternative-id":["10.1145\/3695794.3695800","10.1145\/3695794"],"URL":"https:\/\/doi.org\/10.1145\/3695794.3695800","relation":{},"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"2024-12-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}