{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:57:26Z","timestamp":1776931046842,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","funder":[{"name":"Office of Science of the U.S. Department of Energy","award":["DE-AC02-06CH11357"],"award-info":[{"award-number":["DE-AC02-06CH11357"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759883","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"1073-1084","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Fine-grained Automated Failure Management for Extreme-Scale GPU Accelerated Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1346-6908","authenticated-orcid":false,"given":"Yonatan","family":"Levitt","sequence":"first","affiliation":[{"name":"Intel Corporation, Jerusalem, Israel"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2881-3746","authenticated-orcid":false,"given":"Richard","family":"Barella","sequence":"additional","affiliation":[{"name":"Intel Corporation, Hillsboro, Oregon, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4347-2462","authenticated-orcid":false,"given":"Sam","family":"Zeltner","sequence":"additional","affiliation":[{"name":"Intel Corporation, Hillsboro, Oregon, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4577-4125","authenticated-orcid":false,"given":"Thomas","family":"Musta","sequence":"additional","affiliation":[{"name":"Intel Corporation, Rochester, Minnesota, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4088-2626","authenticated-orcid":false,"given":"Lance","family":"Cheney","sequence":"additional","affiliation":[{"name":"Intel Corporation, Folsom, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0401-7355","authenticated-orcid":false,"given":"Gustavo","family":"Espinosa","sequence":"additional","affiliation":[{"name":"Intel Corporation, Hillsboro, Oregon, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0803-9064","authenticated-orcid":false,"given":"Olivier","family":"Franza","sequence":"additional","affiliation":[{"name":"Intel Corporation, Boston, Massechusets, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8585-6031","authenticated-orcid":false,"given":"Balazs","family":"Gerofi","sequence":"additional","affiliation":[{"name":"Intel Corporation, Hillsboro, Oregon, USA and RIKEN Center for Computational Science (R-CCS), Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"2024. OpenSearch. https:\/\/opensearch.org"},{"key":"e_1_3_3_3_3_2","unstructured":"Altair. 2025. PBS Professional. https:\/\/altair.com\/pbs-professional Retrieved: April 2025."},{"key":"e_1_3_3_3_4_2","unstructured":"Apache. 2021. Kafka. https:\/\/kafka.apache.org\/"},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","unstructured":"A. Avizienis J.-C. Laprie B. Randell and C. Landwehr. 2004. Basic concepts and taxonomy of dependable and secure computing. IEEE Transactions on Dependable and Secure Computing 1 1 (2004) 11\u201333. 10.1109\/TDSC.2004.2","DOI":"10.1109\/TDSC.2004.2"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063427"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2006.1639698"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","unstructured":"Nevin Cini and Gulay Yalcin. 2020. A Methodology for Comparing the Reliability of GPU-Based and CPU-Based HPCs. ACM Comput. Surv. 53 1 Article 22 (Feb. 2020) 33\u00a0pages. 10.1145\/3372790","DOI":"10.1145\/3372790"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","unstructured":"Cristian Constantinescu. 1992. Analyzing the effect of permanent intermittent and transient faults on a gracefully degrading microcomputer. Microelectronics Reliability 32 6 (1992) 861\u2013866. 10.1016\/0026-2714(92)90053-N","DOI":"10.1016\/0026-2714(92)90053-N"},{"key":"e_1_3_3_3_10_2","unstructured":"Crossplane. 2025. Platform Engineering. https:\/\/www.crossplane.io\/ Retrieved: April 2025."},{"key":"e_1_3_3_3_11_2","unstructured":"Shengkun Cui Archit Patke Ziheng Chen Aditya Ranjan Hung Nguyen Phuong Cao Saurabh Jha Brett Bode Gregory Bauer Chandra Narayanaswami Daby Sow Catello\u00a0Di Martino Zbigniew\u00a0T. Kalbarczyk and Ravishankar\u00a0K. Iyer. 2025. Characterizing GPU Resilience and Impact on AI\/HPC Systems. arxiv:https:\/\/arXiv.org\/abs\/2503.11901\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2503.11901"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","unstructured":"J.T. Daly. 2006. A higher order estimate of the optimum checkpoint interval for restart dumps. Future Generation Computer Systems 22 3 (2006) 303\u2013312. 10.1016\/j.future.2004.11.016","DOI":"10.1016\/j.future.2004.11.016"},{"key":"e_1_3_3_3_13_2","unstructured":"Datadog. 2025. Could Automation. https:\/\/www.datadoghq.com\/dg\/cloud-automation\/ Retrieved: April 2025."},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-54420-0_66"},{"key":"e_1_3_3_3_15_2","unstructured":"Harish\u00a0Dattatraya Dixit Laura Boyle Gautham Vunnam Sneha Pendharkar Matt Beadon and Sriram Sankar. 2022. Detecting silent data corruptions in the wild. arxiv:https:\/\/arXiv.org\/abs\/2203.08989\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/2203.08989"},{"key":"e_1_3_3_3_16_2","unstructured":"Elastic. 2023. Elastic Stack. https:\/\/www.elastic.co\/elastic-stack Retrieved from https:\/\/www.elastic.co\/elastic-stack."},{"key":"e_1_3_3_3_17_2","unstructured":"Hewlett\u00a0Packard Enterprise. 2025. HPE Performance Cluster Manager. https:\/\/www.hpe.com\/psnow\/doc\/a00044858enw. Retrieved 2 April 2025."},{"key":"e_1_3_3_3_18_2","unstructured":"Gershon et. al. 2025. The infrastructure powering IBM\u2019s Gen AI model development. arxiv:https:\/\/arXiv.org\/abs\/2407.05467\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2407.05467"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126937"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2015.52"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2010.84"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1364\/OFC.2011.OWH1"},{"key":"e_1_3_3_3_23_2","unstructured":"Richard\u00a0D Hipp. 2020. SQLite. https:\/\/www.sqlite.org\/index.html"},{"key":"e_1_3_3_3_24_2","unstructured":"Broadcom Inc.2025. RabbitMQ. https:\/\/www.rabbitmq.com\/ Retrieved: April 2025."},{"key":"e_1_3_3_3_25_2","first-page":"947","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 947\u2013960. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/jeon"},{"key":"e_1_3_3_3_26_2","unstructured":"Apostolos Kokolis Michael Kuchnik John Hoffman Adithya Kumar Parth Malani Faye Ma Zachary DeVito Shubho Sengupta Kalyan Saladi and Carole-Jean Wu. 2025. Revisiting Reliability in Large-Scale Machine Learning Research Clusters. arxiv:https:\/\/arXiv.org\/abs\/2410.21680\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2410.21680"},{"key":"e_1_3_3_3_27_2","unstructured":"Argonne\u00a0National Laboratory. 2024. Aurora. https:\/\/www.alcf.anl.gov\/aurora"},{"key":"e_1_3_3_3_28_2","unstructured":"Grafana Labs. 2024. Grafana. https:\/\/grafana.com"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSN-S.2019.00020"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-7091-8990-0"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00093"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.18"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00099"},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446091"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/MASCOTS.2017.12"},{"key":"e_1_3_3_3_36_2","unstructured":"NVIDIA. 2023. GPU Memory Error Management. https:\/\/docs.nvidia.com\/deploy\/pdf\/a100-gpu-mem-error-mgmt.pdf Retrieved: April 2025."},{"key":"e_1_3_3_3_37_2","unstructured":"NVIDIA. 2024. GPU Deployment and Management. https:\/\/docs.nvidia.com\/deploy\/pdf\/XID_Errors.pdf Retrieved: April 2025."},{"key":"e_1_3_3_3_38_2","unstructured":"NVIDIA. 2025. Ensuring Reliable Model Training on NVIDIA DGX Cloud. https:\/\/developer.nvidia.com\/blog\/ensuring-reliable-model-training-on-nvidia-dgx-cloud\/ Retrieved: April 2025."},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3650200.3656615"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","unstructured":"K. Omahen and V. Marathe. 1978. Analysis and Applications of the Delay Cycle for the M\/M\/c Queueing System. J. ACM 25 2 (April 1978) 283\u2013303. 10.1145\/322063.322072","DOI":"10.1145\/322063.322072"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00045"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","unstructured":"Behrooz Parhami. 1988. From defects to failures: a view of dependable computing. SIGARCH Comput. Archit. News 16 4 (Sept. 1988) 157\u2013168. 10.1145\/54331.54345","DOI":"10.1145\/54331.54345"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","unstructured":"Bianca Schroeder and Garth\u00a0A. Gibson. 2010. A Large-Scale Study of Failures in High-Performance Computing Systems. IEEE Transactions on Dependable and Secure Computing 7 4 (2010) 337\u2013350. 10.1109\/TDSC.2009.4","DOI":"10.1109\/TDSC.2009.4"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491418.3530773"},{"key":"e_1_3_3_3_45_2","unstructured":"Smartsheet. 2025. Smartsheet. https:\/\/www.smartsheet.com\/ Retrieved: April 2025."},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","unstructured":"Marc Snir Robert\u00a0W Wisniewski Jacob\u00a0A Abraham Sarita\u00a0V Adve Saurabh Bagchi Pavan Balaji Jim Belak Pradip Bose Franck Cappello Bill Carlson Andrew\u00a0A Chien Paul Coteus Nathan\u00a0A Debardeleben Pedro\u00a0C Diniz Christian Engelmann Mattan Erez Saverio Fazzari Al Geist Rinku Gupta Fred Johnson Sriram Krishnamoorthy Sven Leyffer Dean Liberty Subhasish Mitra Todd Munson Rob Schreiber Jon Stearley and Eric\u00a0Van Hensbergen. 2014. Addressing failures in exascale computing. Int. J. High Perform. Comput. Appl. 28 2 (May 2014) 129\u2013173. 10.1177\/1094342014522573","DOI":"10.1177\/1094342014522573"},{"key":"e_1_3_3_3_47_2","unstructured":"Splunk. 2025. Splunk Security Orchestration Automation and Response. https:\/\/www.splunk.com\/en_us\/products\/splunk-security-orchestration-and-automation.html Retrieved: April 2025."},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSN48987.2021.00043"},{"key":"e_1_3_3_3_49_2","unstructured":"Gemini Team. 2024. Gemini: A Family of Highly Capable Multimodal Models. arxiv:https:\/\/arXiv.org\/abs\/2312.11805\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2312.11805"},{"key":"e_1_3_3_3_50_2","unstructured":"Meta\u00a0Llama Team. 2024. The Llama 3 Herd of Models. arxiv:https:\/\/arXiv.org\/abs\/2407.21783\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807666"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056044"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"publisher","unstructured":"John\u00a0W. Young. 1974. A first order approximation to the optimum checkpoint interval. Commun. ACM 17 9 (Sept. 1974) 530\u2013531. 10.1145\/361147.361115","DOI":"10.1145\/361147.361115"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759883","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:48:18Z","timestamp":1773254898000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759883"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":52,"alternative-id":["10.1145\/3712285.3759883","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759883","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}