{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T04:59:08Z","timestamp":1750309148616,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":73,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Spanish Government","award":["PID2019-107255GB-C21"],"award-info":[{"award-number":["PID2019-107255GB-C21"]}]},{"name":"Spanish Government, Department of Research and Universities of the Government of Catalonia","award":["CEX2021-001148-S","RYC2018-025628-I","2021 SGR 00807"],"award-info":[{"award-number":["CEX2021-001148-S","RYC2018-025628-I","2021 SGR 00807"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3625549.3658686","type":"proceedings-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T15:55:29Z","timestamp":1725033329000},"page":"240-252","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Reinforcement Learning-based Adaptive Mitigation of Uncorrected DRAM Errors in the Field"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7738-524X","authenticated-orcid":false,"given":"Isaac","family":"Boixaderas","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8997-4051","authenticated-orcid":false,"given":"Sergi","family":"Mor\u00e9","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5785-4942","authenticated-orcid":false,"given":"Javier","family":"Bartolome","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1584-1182","authenticated-orcid":false,"given":"David","family":"Vicente","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9334-3330","authenticated-orcid":false,"given":"Petar","family":"Radojkovi\u0107","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9392-0521","authenticated-orcid":false,"given":"Paul M.","family":"Carpenter","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5146-103X","authenticated-orcid":false,"given":"Eduard","family":"Ayguad\u00e9","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"},{"name":"Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}]}],"member":"320","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. Logs of Real Parallel Workloads from Production Systems. https:\/\/www.cs.huji.ac.il\/labs\/parallel\/workload\/logs.html."},{"key":"e_1_3_2_1_2_1","unstructured":"[n. d.]. Partnership for Advanced Computing in Europe (PRACE) Research Infrastructure. http:\/\/www.prace-ri.eu."},{"key":"e_1_3_2_1_3_1","unstructured":"Barcelona Supercomputing Center. 2016. MareNostrum 3 User's Guide."},{"key":"e_1_3_2_1_4_1","volume-title":"System Architecture. https:\/\/www.bsc.es\/marenostrum\/marenostrum\/technical-information","author":"Barcelona Supercomputing Center","year":"2017","unstructured":"Barcelona Supercomputing Center. 2017. MareNostrum 4 (2017) System Architecture. https:\/\/www.bsc.es\/marenostrum\/marenostrum\/technical-information."},{"key":"e_1_3_2_1_5_1","volume-title":"International Conference on Dependable Systems and Networks Workshop (DSN-W).","author":"Baseman Elisabeth","year":"2016","unstructured":"Elisabeth Baseman, Nathan DeBardeleben, Kurt Ferreira, Scott Levy, Steven Raasch, Vilas Sridharan, Taniya Siddiqua, and Qiang Guan. 2016. Improving DRAM Fault Characterization through Machine Learning. In International Conference on Dependable Systems and Networks Workshop (DSN-W)."},{"key":"e_1_3_2_1_6_1","volume-title":"Automating DRAM Fault Mitigation By Learning From Experience. In International Conference on Dependable Systems and Networks Workshops, (DSN-W).","author":"Baseman Elisabeth","year":"2017","unstructured":"Elisabeth Baseman, Nathan DeBardeleben, Kurt B. Ferreira, Vilas Sridharan, Taniya Siddiqua, and Olena Tkachenko. 2017. Automating DRAM Fault Mitigation By Learning From Experience. In International Conference on Dependable Systems and Networks Workshops, (DSN-W)."},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference for High Performance Computing, Networking, Storage, and Analysis (SC).","author":"Bautista-Gomez Leonardo","year":"2011","unstructured":"Leonardo Bautista-Gomez, Seiji Tsuboi, Dimitri Komatitsch, Franck Cappello, Naoya Maruyama, and Satoshi Matsuoka. 2011. FTI: High performance fault tolerance interface for hybrid systems. In International Conference for High Performance Computing, Networking, Storage, and Analysis (SC)."},{"key":"e_1_3_2_1_8_1","article-title":"A Markovian decision process","volume":"6","author":"Bellman Richard","year":"1957","unstructured":"Richard Bellman. 1957. A Markovian decision process. Journal of mathematics and mechanics 6, 5 (1957).","journal-title":"Journal of mathematics and mechanics"},{"key":"e_1_3_2_1_9_1","unstructured":"Keren Bergman Shekhar Borkar Dan Campbell William Carlson William Dally Monty Denneau Paul Franzon William Harrod Kerry Hill Jon Hiller et al. 2008. Exascale computing study: Technology challenges in achieving exascale systems. Defense Advanced Research Projects Agency Information Processing Techniques Office (DARPA IPTO) Tech. Rep 15 (2008)."},{"key":"e_1_3_2_1_10_1","volume-title":"UERL: An RL-based method for mitigating DRAM Uncorrected Errors. https:\/\/github.com\/bsc-mem\/UERL.","author":"Boixaderas Isaac","year":"2024","unstructured":"Isaac Boixaderas, Sergi Mor\u00e9, Javier Bartolome, David Vicente, Petar Radojkovi\u0107, Paul M. Carpenter, and Eduard Ayguad\u00e9. 2024. UERL: An RL-based method for mitigating DRAM Uncorrected Errors. https:\/\/github.com\/bsc-mem\/UERL."},{"key":"e_1_3_2_1_11_1","volume-title":"Cost-Aware Prediction of Uncorrected DRAM Errors in the Field. In International Conference for High Performance Computing, Networking, Storage and Analysis (SC).","author":"Boixaderas Isaac","year":"2020","unstructured":"Isaac Boixaderas, Darko Zivanovic, Sergi Mor\u00e9, Javier Bartolome, David Vicente, Marc Casas, Paul M. Carpenter, Petar Radojkovi\u0107, and Eduard Ayguad\u00e9. 2020. Cost-Aware Prediction of Uncorrected DRAM Errors in the Field. In International Conference for High Performance Computing, Networking, Storage and Analysis (SC)."},{"key":"e_1_3_2_1_12_1","volume-title":"An In-Depth Correlative Study Between DRAM Errors and Server Failures in Production Data Centers. In International Symposium on Reliable Distributed Systems (SRDS).","author":"Cheng Zhinan","year":"2022","unstructured":"Zhinan Cheng, Shujie Han, Patrick PC Lee, Xin Li, Jiongzhou Liu, and Zhan Li. 2022. An In-Depth Correlative Study Between DRAM Errors and Server Failures in Production Data Centers. In International Symposium on Reliable Distributed Systems (SRDS)."},{"key":"e_1_3_2_1_13_1","volume-title":"International Workshop on Workload Characterization (WWC).","author":"Cirne Walfredo","year":"2001","unstructured":"Walfredo Cirne and Francine Berman. 2001. A comprehensive model of the supercomputer workload. In International Workshop on Workload Characterization (WWC)."},{"key":"e_1_3_2_1_14_1","volume-title":"Symposium on Operating Systems Principles (SOSP).","author":"Cortez Eli","year":"2017","unstructured":"Eli Cortez, Anand Bonde, Alexandre Muzio, Mark Russinovich, Marcus Fontoura, and Ricardo Bianchini. 2017. Resource central: Understanding and Predicting Workloads for Improved Resource Management in Large Cloud Platforms. In Symposium on Operating Systems Principles (SOSP)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.63"},{"key":"e_1_3_2_1_16_1","volume-title":"International Symposium on Cluster Computing and the Grid (CCGRID).","author":"Daly John T","year":"2008","unstructured":"John T Daly, Lori A Pritchett-Sheats, and Sarah Ellen Michalak. 2008. Application MTTFE vs. platform MTBF: A fresh perspective on system reliability and application throughput for computations at scale. In International Symposium on Cluster Computing and the Grid (CCGRID)."},{"key":"e_1_3_2_1_17_1","volume-title":"Doomsday: Predicting Which Node Will Fail When on Supercomputers. In International Conference for High Performance Computing, Networking, Storage, and Analysis (SC).","author":"Das Anwesha","year":"2018","unstructured":"Anwesha Das, Frank Mueller, Paul Hargrove, Eric Roman, and Scott Baden. 2018. Doomsday: Predicting Which Node Will Fail When on Supercomputers. In International Conference for High Performance Computing, Networking, Storage, and Analysis (SC)."},{"key":"e_1_3_2_1_18_1","volume-title":"Memory Failure Prediction Using Online Learning. In International Symposium on Memory Systems (MEMSYS).","author":"Du Xiaoming","year":"2018","unstructured":"Xiaoming Du and Cong Li. 2018. Memory Failure Prediction Using Online Learning. In International Symposium on Memory Systems (MEMSYS)."},{"key":"e_1_3_2_1_19_1","volume-title":"Fault-Aware Prediction-Guided Page Offlining for Uncorrectable Memory Error Prevention. In International Conference on Computer Design (ICCD).","author":"Du Xiaoming","year":"2021","unstructured":"Xiaoming Du, Cong Li, Shen Zhou, Xian Liu, Xiaohan Xu, Tianjiao Wang, and Shijian Ge. 2021. Fault-Aware Prediction-Guided Page Offlining for Uncorrectable Memory Error Prevention. In International Conference on Computer Design (ICCD)."},{"key":"e_1_3_2_1_20_1","volume-title":"Predicting Uncorrectable Memory Errors for Proactive Replacement: An Empirical Study on Large-Scale Field Data. In European Dependable Computing Conference (EDCC).","author":"Du Xiaoming","year":"2020","unstructured":"Xiaoming Du, Cong Li, Shen Zhou, Mao Ye, and Jing Li. 2020. Predicting Uncorrectable Memory Errors for Proactive Replacement: An Empirical Study on Large-Scale Field Data. In European Dependable Computing Conference (EDCC)."},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Distributed Computing Systems (ICDCS).","author":"Elliott James","year":"2012","unstructured":"James Elliott, Kishor Kharbas, David Fiala, Frank Mueller, Kurt Ferreira, and Christian Engelmann. 2012. Combining partial redundancy and checkpointing for HPC. In International Conference on Distributed Computing Systems (ICDCS)."},{"key":"e_1_3_2_1_22_1","volume-title":"William Harrod","author":"Elnozahy Mootaz","year":"2009","unstructured":"Mootaz Elnozahy. 2009. System resilience at extreme scale: A white paper. DARPA Resilience Report for ITO, William Harrod (2009)."},{"key":"e_1_3_2_1_23_1","volume-title":"Deep Reinforcement Agent for Scheduling in HPC. In International Parallel and Distributed Processing Symposium (IPDPS).","author":"Fan Yuping","year":"2021","unstructured":"Yuping Fan, Zhiling Lan, Taylor Childers, Paul Rich, William Allcock, and Michael E Papka. 2021. Deep Reinforcement Agent for Scheduling in HPC. In International Parallel and Distributed Processing Symposium (IPDPS)."},{"key":"e_1_3_2_1_24_1","volume-title":"International Parallel and Distributed Processing Symposium (IPDPS).","author":"Frank Alvaro","year":"2021","unstructured":"Alvaro Frank, Manuel Baumgartner, Reza Salkhordeh, and Andr\u00e9 Brinkmann. 2021. Improving checkpointing intervals by considering individual job failure probabilities. In International Parallel and Distributed Processing Symposium (IPDPS)."},{"key":"e_1_3_2_1_25_1","volume-title":"Predicting DRAM Reliability in the Field with Machine Learning. In Middleware Conference: Industrial Track.","author":"Giurgiu Ioana","year":"2017","unstructured":"Ioana Giurgiu, Jacint Szabo, Dorothea Wiesmann, and John Bird. 2017. Predicting DRAM Reliability in the Field with Machine Learning. In Middleware Conference: Industrial Track."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126937"},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference for High Performance Computing, Networking, Storage and Analysis (SC).","author":"Hart David","year":"2011","unstructured":"David Hart. 2011. Deep and wide metrics for HPC resource capability and project usage. In International Conference for High Performance Computing, Networking, Storage and Analysis (SC)."},{"key":"e_1_3_2_1_28_1","unstructured":"Hewlett Packard Enterprise 2016. HPE ProLiant DL580 Gen9 Server User Guide. Hewlett Packard Enterprise."},{"key":"e_1_3_2_1_29_1","unstructured":"HP. 2016. How memory RAS technologies can enhance the uptime of HPE ProLiant servers. Technical white paper 4AA4-3490ENW. Hewlett Packard Enterprise."},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Hwang Andy A.","year":"2012","unstructured":"Andy A. Hwang, Ioan A. Stefanovici, and Bianca Schroeder. 2012. Cosmic Rays Don't Strike Twice: Understanding the Nature of DRAM Errors and the Implications for System Design. In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_31_1","unstructured":"IBM 2014. System x iDataPlex dx360 M4 Types 7912 and 7913: Problem Determination and Service Guide. IBM."},{"key":"e_1_3_2_1_32_1","unstructured":"Intel Server Products and Solutions 2017. System Event Log (SEL) Troubleshooting Guide. Intel Server Products and Solutions."},{"key":"e_1_3_2_1_33_1","volume-title":"Symposium on Principles and practice of parallel programming (PPoPP).","author":"Iskra Kamil","year":"2008","unstructured":"Kamil Iskra, John W Romein, Kazutomo Yoshii, and Pete Beckman. 2008. ZOID: I\/O-forwarding infrastructure for petascale architectures. In Symposium on Principles and practice of parallel programming (PPoPP)."},{"key":"e_1_3_2_1_34_1","volume-title":"MCELOG: Memory Error Handling in User Space. In International Linux System Technology Conference (Linux Kongress).","author":"Kleen Andy","year":"2010","unstructured":"Andy Kleen. 2010. MCELOG: Memory Error Handling in User Space. In International Linux System Technology Conference (Linux Kongress)."},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference for High Performance Computing, Networking, Storage, and Analysis (SC).","author":"Levy Scott","year":"2018","unstructured":"Scott Levy, Kurt B. Ferreira, Nathan DeBardeleben, Taniya Siddiqua, Vilas Sridharan, and Elisabeth Baseman. 2018. Lessons Learned from Memory Errors Observed over the Lifetime of Cielo. In International Conference for High Performance Computing, Networking, Storage, and Analysis (SC)."},{"key":"e_1_3_2_1_36_1","volume-title":"International Conference for High Performance Computing, Networking, Storage and Analysis (SC).","author":"Li Cong","year":"2022","unstructured":"Cong Li, Yu Zhang, Jialei Wang, Hang Chen, Xian Liu, Tai Huang, Liang Peng, Shen Zhou, Lixin Wang, and Shijian Ge. 2022. From Correctable Memory Errors to Uncorrectable Memory Errors: what Error Bits Tell. In International Conference for High Performance Computing, Networking, Storage and Analysis (SC)."},{"key":"e_1_3_2_1_37_1","volume-title":"USENIX Conference on USENIX Annual Technical Conference (USENIXATC).","author":"Li Xin","year":"2010","unstructured":"Xin Li, Michael C. Huang, Kai Shen, and Lingkun Chu. 2010. A Realistic Evaluation of Memory Hardware Errors and Software System Susceptibility. In USENIX Conference on USENIX Annual Technical Conference (USENIXATC)."},{"key":"e_1_3_2_1_38_1","volume-title":"International Conference on Dependable Systems and Networks (DSN).","author":"Martino Catello Di","year":"2014","unstructured":"Catello Di Martino, Zbigniew Kalbarczyk, Ravishankar K. Iyer, Fabio Baccanico, Joseph Fullop, and William Kramer. 2014. Lessons Learned from the Analysis of System Failures at Petascale: The Case of Blue Waters. In International Conference on Dependable Systems and Networks (DSN)."},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Dependable Systems and Networks (DSN).","author":"Meza Justin","year":"2015","unstructured":"Justin Meza, Qiang Wu, Sanjeev Kumar, and Onur Mutlu. 2015. Revisiting Memory Errors in Large-Scale Production Data Centers: Analysis and Modeling of New Trends from the Field. In International Conference on Dependable Systems and Networks (DSN)."},{"key":"e_1_3_2_1_40_1","volume-title":"Playing Atari with Deep Reinforcement Learning. arXiv preprint arXiv:1312.5602","author":"Mnih Volodymyr","year":"2013","unstructured":"Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, and Martin Riedmiller. 2013. Playing Atari with Deep Reinforcement Learning. arXiv preprint arXiv:1312.5602 (2013)."},{"key":"e_1_3_2_1_42_1","volume-title":"International Symposium on Workload Characterization (IISWC).","author":"Mukhanov Lev","year":"2019","unstructured":"Lev Mukhanov, Konstantinos Tovletoglou, Hans Vandierendonck, Dimitrios S Nikolopoulos, and Georgios Karakonstantis. 2019. Workload-Aware DRAM Error Prediction using Machine Learning. In International Symposium on Workload Characterization (IISWC)."},{"key":"e_1_3_2_1_43_1","volume-title":"International Symposium on High Performance Computer Architecture (HPCA).","author":"Nie Bin","year":"2016","unstructured":"Bin Nie, Devesh Tiwari, Saurabh Gupta, Evgenia Smirni, and James H Rogers. 2016. A large-scale study of soft-errors on GPUs in the field. In International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_44_1","volume-title":"International Conference on Dependable Systems and Networks (DSN).","author":"Nie Bin","year":"2018","unstructured":"Bin Nie, Ji Xue, Saurabh Gupta, Tirthak Patel, Christian Engelmann, Evgenia Smirni, and Devesh Tiwari. 2018. Machine Learning Models for GPU Error Prediction in a Large Scale HPC System. In International Conference on Dependable Systems and Networks (DSN)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.5555\/1306871.1306895"},{"key":"e_1_3_2_1_46_1","volume-title":"International Parallel and Distributed Processing Symposium (IPDPS).","author":"Oliner Adam J","year":"2005","unstructured":"Adam J Oliner, Ramendra K Sahoo, Jos\u00e9 E Moreira, and Manish Gupta. 2005. Performance implications of periodic checkpointing on large-scale cluster systems. In International Parallel and Distributed Processing Symposium (IPDPS)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Petar Radojkovic Manolis Marazakis Paul Carpenter Reiley Jeyapaul Dimitris Gizopoulos Martin Schulz Adria Armejach Eduard Ayguade Fran\u00e7ois Bodin Ramon Canal et al. 2020. Towards resilient EU HPC systems: A blueprint.","DOI":"10.1145\/3310273.3323434"},{"key":"e_1_3_2_1_48_1","volume-title":"Tog Job Heterogeneity in HPC: A NERSC Case Study. In International Symposium on Cluster, Cloud and Grid Computing (CCGrid).","author":"Rodrigo Gonzalo P.","year":"2016","unstructured":"Gonzalo P. Rodrigo, Per-Olov \u00d6stberg, Erik Elmroth, Katie Antypas, Richard Gerber, and Lavanya Ramakrishnan. 2016. Tog Job Heterogeneity in HPC: A NERSC Case Study. In International Symposium on Cluster, Cloud and Grid Computing (CCGrid)."},{"key":"e_1_3_2_1_49_1","volume-title":"Towards understanding HPC users and systems: a NERSC case study. J. Parallel and Distrib. Comput. 111","author":"Rodrigo Gonzalo P","year":"2018","unstructured":"Gonzalo P Rodrigo, P-O \u00d6stberg, Erik Elmroth, Katie Antypas, Richard Gerber, and Lavanya Ramakrishnan. 2018. Towards understanding HPC users and systems: a NERSC case study. J. Parallel and Distrib. Comput. 111 (2018)."},{"key":"e_1_3_2_1_50_1","volume-title":"HPC System Lifetime Story: Workload Characterization and Evolutionary Analyses on NERSC Systems. In International Symposium on High-Performance Parallel and Distributed Computing (HPDC).","author":"Rodrigo \u00c1lvarez Gonzalo Pedro","year":"2015","unstructured":"Gonzalo Pedro Rodrigo \u00c1lvarez, Per-Olov \u00d6stberg, Erik Elmroth, Katie Antypas, Richard Gerber, and Lavanya Ramakrishnan. 2015. HPC System Lifetime Story: Workload Characterization and Evolutionary Analyses on NERSC Systems. In International Symposium on High-Performance Parallel and Distributed Computing (HPDC)."},{"key":"e_1_3_2_1_51_1","unstructured":"Rob Ross Jose Moreira Kim Cupps and Wayne Pfeiffer. 2006. Parallel I\/O on the IBM blue gene\/L system. Blue Gene\/L Consortium Quarterly Newsletter Tech. Rep. First Quarter (2006)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.5555\/2388996.2389022"},{"key":"e_1_3_2_1_53_1","volume-title":"Prioritized Experience Replay. arXiv preprint arXiv:1511.05952","author":"Schaul Tom","year":"2015","unstructured":"Tom Schaul, John Quan, Ioannis Antonoglou, and David Silver. 2015. Prioritized Experience Replay. arXiv preprint arXiv:1511.05952 (2015)."},{"key":"e_1_3_2_1_54_1","series-title":"Journal of Physics: Conference Series","volume-title":"Understanding failures in petascale computers","author":"Schroeder Bianca","unstructured":"Bianca Schroeder and Garth A Gibson. 2007. Understanding failures in petascale computers. In Journal of Physics: Conference Series, Vol. 78. IOP Publishing."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2009.4"},{"key":"e_1_3_2_1_56_1","volume-title":"DRAM Errors in the Wild: A Large-scale Field Study. In International Joint Conference on Measurement and Modeling of Computer Systems (SIGMETRICS).","author":"Schroeder Bianca","year":"2009","unstructured":"Bianca Schroeder, Eduardo Pinheiro, and Wolf-Dietrich Weber. 2009. DRAM Errors in the Wild: A Large-scale Field Study. In International Joint Conference on Measurement and Modeling of Computer Systems (SIGMETRICS)."},{"key":"e_1_3_2_1_57_1","volume-title":"Furlani","author":"Simakov Nikolay A","year":"2018","unstructured":"Nikolay A Simakov, Joseph P White, Robert L. DeLeon, Steven M. Gallo, Matthew D. Jones, Jeffrey T. Palmer, Benjamin Plessinger, and Thomas R. Furlani. 2018. A Workload Analysis of NSF's Innovative HPC Resources Using XDMoD. arXiv preprint arXiv:1801.04306 (2018)."},{"key":"e_1_3_2_1_58_1","volume-title":"International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Sridharan Vilas","year":"2015","unstructured":"Vilas Sridharan, Nathan DeBardeleben, Sean Blanchard, Kurt B. Ferreira, Jon Stearley, John Shalf, and Sudhanva Gurumurthi. 2015. Memory Errors in Modern Systems: The Good, The Bad, and The Ugly. In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.5555\/2388996.2389100"},{"key":"e_1_3_2_1_60_1","volume-title":"Feng Shui of Supercomputer Memory: Positional Effects in DRAM and SRAM Faults. In International Conference on High Performance Computing, Networking, Storage and Analysis (SC).","author":"Sridharan Vilas","year":"2013","unstructured":"Vilas Sridharan, Jon Stearley, Nathan DeBardeleben, Sean Blanchard, and Sudhanva Gurumurthi. 2013. Feng Shui of Supercomputer Memory: Positional Effects in DRAM and SRAM Faults. In International Conference on High Performance Computing, Networking, Storage and Analysis (SC)."},{"key":"e_1_3_2_1_61_1","volume-title":"System-Level Hardware Failure Prediction Using Deep Learning. In Annual Design Automation Conference (DAC).","author":"Sun Xiaoyi","year":"2019","unstructured":"Xiaoyi Sun, Krishnendu Chakrabarty, Ruirui Huang, Yiquan Chen, Bing Zhao, Hai Cao, Yinhe Han, Xiaoyao Liang, and Li Jiang. 2019. System-Level Hardware Failure Prediction Using Deep Learning. In Annual Design Automation Conference (DAC)."},{"volume-title":"Assessment of the Effect of Memory Page Retirement on System RAS Against Hardware Faults. In International Conference on Dependable Systems and Networks (DSN).","author":"Tang Dong","key":"e_1_3_2_1_62_1","unstructured":"Dong Tang, Peter Carruthers, Zuheir Totari, and Michael W. Shapiro. 2006. Assessment of the Effect of Memory Page Retirement on System RAS Against Hardware Faults. In International Conference on Dependable Systems and Networks (DSN)."},{"key":"e_1_3_2_1_63_1","volume-title":"International Conference on Parallel and Distributed Systems (ICPADS).","author":"Wang Chao","year":"2010","unstructured":"Chao Wang, Frank Mueller, Christian Engelmann, and Stephen L Scott. 2010. Hybrid checkpointing for MPI jobs in HPC environments. In International Conference on Parallel and Distributed Systems (ICPADS)."},{"key":"e_1_3_2_1_64_1","volume-title":"On Workload-Aware DRAM Failure Prediction in Large-Scale Data Centers. In VLSI Test Symposium (VTS).","author":"Wang Xingyi","year":"2021","unstructured":"Xingyi Wang, Yu Li, Yiquan Chen, Shiwen Wang, Yin Du, Cheng He, YuZhong Zhang, Pinan Chen, Xin Li, Wenjun Song, et al. 2021. On Workload-Aware DRAM Failure Prediction in Large-Scale Data Centers. In VLSI Test Symposium (VTS)."},{"key":"e_1_3_2_1_65_1","volume-title":"Dueling Network Architectures for Deep Reinforcement Learning. In International Conference on Machine Learning (ICML).","author":"Wang Ziyu","year":"2016","unstructured":"Ziyu Wang, Tom Schaul, Matteo Hessel, Hado Hasselt, Marc Lanctot, and Nando Freitas. 2016. Dueling Network Architectures for Deep Reinforcement Learning. In International Conference on Machine Learning (ICML)."},{"volume-title":"Learning from Delayed Rewards. Ph. D. Dissertation","author":"Cornish Hellaby Watkins Christopher John","key":"e_1_3_2_1_66_1","unstructured":"Christopher John Cornish Hellaby Watkins. 1989. Learning from Delayed Rewards. Ph. D. Dissertation. King's College, Cambridge, UK. http:\/\/www.cs.rhul.ac.uk\/~chrisw\/new_thesis.pdf"},{"volume-title":"Lenovo System x3850 X6 and x3950 X6 Planning and Implementation Guide","author":"Watts David","key":"e_1_3_2_1_67_1","unstructured":"David Watts, Rani Doughly, and Ilya Solovyev. 2018. Lenovo System x3850 X6 and x3950 X6 Planning and Implementation Guide. Lenovo Press."},{"key":"e_1_3_2_1_68_1","volume-title":"Protecting Against Evaluation Overfitting in Empirical Reinforcement Learning. In Symposium on Adaptive Dynamic Programming and Reinforcement Learning (ADPRL).","author":"Whiteson Shimon","year":"2011","unstructured":"Shimon Whiteson, Brian Tanner, Matthew E Taylor, and Peter Stone. 2011. Protecting Against Evaluation Overfitting in Empirical Reinforcement Learning. In Symposium on Adaptive Dynamic Programming and Reinforcement Learning (ADPRL)."},{"key":"e_1_3_2_1_69_1","unstructured":"John Wilkes. 2020. Yet more Google compute cluster trace data. Google research blog. Posted at https:\/\/ai.googleblog.com\/2020\/04\/yet-more-google-compute-cluster-trace.html."},{"key":"e_1_3_2_1_70_1","volume-title":"What's Working in HPC: Investigating HPC User Behavior and Productivity. CTWatch Quarterly 2, 4A","author":"Wolter Nicole","year":"2006","unstructured":"Nicole Wolter, Michael O McCracken, Allan Snavely, Lorin Hochstein, Taiga Nakamura, and Victor Basili. 2006. What's Working in HPC: Investigating HPC User Behavior and Productivity. CTWatch Quarterly 2, 4A (2006)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503264"},{"key":"e_1_3_2_1_72_1","volume-title":"Slurm: Simple Linux Utility for Resource Management. In Workshop on Job Scheduling Strategies for Parallel Processing (JSSPP).","author":"Yoo Andy B.","year":"2003","unstructured":"Andy B. Yoo, Morris A. Jette, and Mark Grondona. 2003. Slurm: Simple Linux Utility for Resource Management. In Workshop on Job Scheduling Strategies for Parallel Processing (JSSPP)."},{"key":"e_1_3_2_1_73_1","volume-title":"Predicting DRAM-Caused Node Unavailability in Hyper-Scale Clouds. In International Conference on Dependable Systems and Networks (DSN).","author":"Zhang Pengcheng","year":"2022","unstructured":"Pengcheng Zhang, Yunong Wang, Xuhua Ma, Yaoheng Xu, Bin Yao, Xudong Zheng, and Linquan Jiang. 2022. Predicting DRAM-Caused Node Unavailability in Hyper-Scale Clouds. In International Conference on Dependable Systems and Networks (DSN)."},{"key":"e_1_3_2_1_74_1","volume-title":"DRAM Errors in the Field: A Statistical Approach. In International Symposium on Memory Systems (MEMSYS).","author":"Zivanovic Darko","year":"2019","unstructured":"Darko Zivanovic, Pouya Esmaili Dokht, Sergi Mor\u00e9, Javier Bartolome, Paul M. Carpenter, Petar Radojkovi\u0107, and Eduard Ayguad\u00e9. 2019. DRAM Errors in the Field: A Statistical Approach. In International Symposium on Memory Systems (MEMSYS)."}],"event":{"name":"HPDC '24: 33rd International Symposium on High-Performance Parallel and Distributed Computing","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"],"location":"Pisa Italy","acronym":"HPDC '24"},"container-title":["Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658686","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3625549.3658686","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:50:38Z","timestamp":1750287038000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658686"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":73,"alternative-id":["10.1145\/3625549.3658686","10.1145\/3625549"],"URL":"https:\/\/doi.org\/10.1145\/3625549.3658686","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-08-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}