{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T16:11:41Z","timestamp":1774627901950,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"U.S. Department of Energy","doi-asserted-by":"publisher","award":["DE-AC05-00OR22725"],"award-info":[{"award-number":["DE-AC05-00OR22725"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IS-2130681"],"award-info":[{"award-number":["IS-2130681"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656615","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"188-200","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Understanding GPU Memory Corruption at Extreme Scale: The Summit Case Study"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8872-7463","authenticated-orcid":false,"given":"Vladyslav","family":"Oles","sequence":"first","affiliation":[{"name":"Oak Ridge National Laboratory, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3392-6574","authenticated-orcid":false,"given":"Anna","family":"Schmedding","sequence":"additional","affiliation":[{"name":"William &amp; Mary, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2043-6026","authenticated-orcid":false,"given":"George","family":"Ostrouchov","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7207-7814","authenticated-orcid":false,"given":"Woong","family":"Shin","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8754-581X","authenticated-orcid":false,"given":"Evgenia","family":"Smirni","sequence":"additional","affiliation":[{"name":"William &amp; Mary, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4365-6416","authenticated-orcid":false,"given":"Christian","family":"Engelmann","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. NVidia XID errors. https:\/\/docs.nvidia.com\/deploy\/xid-errors\/index.html."},{"key":"e_1_3_2_1_2_1","volume-title":"A Flexible Random Forest-Based Feature Importance Framework. arXiv preprint arXiv:2307.01932","author":"Agarwal Abhineet","year":"2023","unstructured":"Abhineet Agarwal, Ana\u00a0M Kenney, Yan\u00a0Shuo Tan, Tiffany\u00a0M Tang, and Bin Yu. 2023. MDI+: A Flexible Random Forest-Based Feature Importance Framework. arXiv preprint arXiv:2307.01932 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Categorical data analysis. Vol.\u00a0792","author":"Agresti Alan","unstructured":"Alan Agresti. 2012. Categorical data analysis. Vol.\u00a0792. John Wiley & Sons."},{"key":"e_1_3_2_1_4_1","unstructured":"Ghazanfar Ali Sridutt Bhalachandra Nicholas Wright Alan Sill and Yong Chen. 2020. Evaluation of power controls and counters on general-purpose Graphics Processing Units (GPUs). (2020)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356172"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071066"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1002\/widm.1072"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN48987.2021.00018"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.2517-6161.1972.tb00899.x"},{"key":"e_1_3_2_1_10_1","volume-title":"Aarohi: Making Real-Time Node Failure Prediction Feasible. In 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 1092\u20131101","author":"Das Anwesha","year":"2020","unstructured":"Anwesha Das, Frank Mueller, and Barry Rountree. 2020. Aarohi: Making Real-Time Node Failure Prediction Feasible. In 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 1092\u20131101."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.21105\/joss.01317"},{"key":"e_1_3_2_1_12_1","volume-title":"d.]. Top500. Retrieved","author":"Dongarra J.","year":"2019","unstructured":"Jack\u00a0J. Dongarra, Hans\u00a0W. Meuer, and Erich Strohmaier. [n. d.]. Top500. Retrieved May 7, 2019 from https:\/\/www.top500.org\/"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2014.6844486"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3502181.3531465"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the Workshop on Silicon Errors in Logic-System Effects.","author":"Kumar\u00a0Sastry Hari Siva","year":"2015","unstructured":"Siva Kumar\u00a0Sastry Hari, Timothy Tsai, Mark Stephenson, Stephen\u00a0W Keckler, and Joel Emer. 2015. SASSIFI: Evaluating resilience of GPU applications. In Proceedings of the Workshop on Silicon Errors in Logic-System Effects."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2678373.2665726"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2508148.2485928"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.fishres.2020.105534"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2015.57"},{"key":"e_1_3_2_1_20_1","unstructured":"Jie Min Yili Hong William\u00a0Q. Meeker and George Ostrouchov. 2023. A Spatially Correlated Competing Risks Time-to-Event Model for Supercomputer GPU Failure Data. arxiv:2303.16369\u00a0[stat.AP]"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS47720.2021.9553924"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446091"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/MASCOTS.2017.12"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2018.00022"},{"key":"e_1_3_2_1_25_1","volume-title":"Fault Site Pruning for Practical Reliability Analysis of GPGPU Applications. In 2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 749\u2013761","author":"Nie Bin","year":"2018","unstructured":"Bin Nie, Lishan Yang, Adwait Jog, and Evgenia Smirni. 2018. Fault Site Pruning for Practical Reliability Analysis of GPGPU Applications. In 2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 749\u2013761."},{"key":"e_1_3_2_1_26_1","volume-title":"Technical Note. Retrieved","author":"NVIDIA Corporation","year":"2021","unstructured":"NVIDIA Corporation. [n. d.]. Dynamic Page Retirement, Technical Note. Retrieved July 10, 2021 from https:\/\/docs.nvidia.com\/deploy\/dynamic-page-retirement\/index.html"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-020-03324-9"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00045"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080242"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2021.3131571"},{"key":"e_1_3_2_1_31_1","volume-title":"Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 809\u2013814","author":"Previlon G","year":"2018","unstructured":"Fritz\u00a0G Previlon, Charu Kalra, David\u00a0R Kaeli, and Paolo Rech. 2018. Evaluating the impact of execution parameters on program vulnerability in gpu applications. In 2018 Design, Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 809\u2013814."},{"key":"e_1_3_2_1_32_1","volume-title":"Ensemble machine learning: Methods and applications","author":"Yanjun Qi.","unstructured":"Yanjun Qi. 2012. Random forest for bioinformatics. In Ensemble machine learning: Methods and applications. Springer, 307\u2013323."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2017.30"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS55109.2022.00004"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/1897816.1897844"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476188"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.13139\/OLCF\/1970187"},{"key":"e_1_3_2_1_38_1","first-page":"626","article-title":"Rectangular confidence regions for the means of multivariate normal distributions","volume":"62","author":"\u0160id\u00e1k Zbyn\u011bk","year":"1967","unstructured":"Zbyn\u011bk \u0160id\u00e1k. 1967. Rectangular confidence regions for the means of multivariate normal distributions. J. Amer. Statist. Assoc. 62, 318 (1967), 626\u2013633.","journal-title":"J. Amer. Statist. Assoc."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4419-6993-4_3"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Charles Spearman. 1961. The proof and measurement of association between two things. (1961).","DOI":"10.1037\/11491-005"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2786763.2694348"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.13"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503257"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807666"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056044"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN48987.2021.00041"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/IOLTS.2017.8046209"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE43902.2021.00114"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2020.2980541"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410220.3453917"}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","location":"Kyoto Japan","acronym":"ICS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656615","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656615","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:24:40Z","timestamp":1755876280000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656615"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":50,"alternative-id":["10.1145\/3650200.3656615","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656615","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}