{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T10:46:48Z","timestamp":1769856408247,"version":"3.49.0"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,5,1]],"date-time":"2025-05-01T00:00:00Z","timestamp":1746057600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,1]],"date-time":"2025-05-01T00:00:00Z","timestamp":1746057600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J. Comput. Sci. Technol."],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s11390-025-4634-6","type":"journal-article","created":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T03:15:43Z","timestamp":1747883743000},"page":"671-685","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["ResCheckpointer: Building Program Error Resilience-Aware Checkpointing Mechanism for HPC Systems"],"prefix":"10.1007","volume":"40","author":[{"given":"Xiao-Hui","family":"Wei","sequence":"first","affiliation":[]},{"given":"Shi-Yu","family":"Tong","sequence":"additional","affiliation":[]},{"given":"Zhong-Ao","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Xiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Heng-Shan","family":"Yue","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,22]]},"reference":[{"key":"4634_CR1","volume-title":"High-end computing resilience: Analysis of issues facing the HEC community and path-forward for research and development, 2009","author":"N DeBardeleben","year":"2025","unstructured":"DeBardeleben N, Laros J, Daly J T, Scott S L, Engelmann C, Harrod B. High-end computing resilience: Analysis of issues facing the HEC community and path-forward for research and development, 2009. https:\/\/www.christian-engelmann.info\/publications\/debardeleben09high-end.pdf, Mar. 2025."},{"key":"4634_CR2","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126960","volume-title":"Proc. the 2017 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"D Oliveira","year":"2017","unstructured":"Oliveira D, Pilla L, DeBardeleben N, Blanchard S, Quinn H, Koren I, Navaux P, Rech P. Experimental and analytical study of Xeon Phi reliability. In Proc. the 2017 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2017, Article No. 28. DOI: https:\/\/doi.org\/10.1145\/3126908.3126960."},{"key":"4634_CR3","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126937","volume-title":"Proc. the 2017 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"S Gupta","year":"2017","unstructured":"Gupta S, Patel T, Engelmann C, Tiwari D. Failures in large scale systems: Long-term measurement, analysis, and implications. In Proc. the 2017 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2017, Article No. 44. DOI: https:\/\/doi.org\/10.1145\/3126908.3126937."},{"key":"4634_CR4","volume-title":"Silent data corruptions at scale","author":"H D Dixit","year":"2025","unstructured":"Dixit H D, Pendharkar S, Beadon M, Mason C, Chakravarthy T, Muthiah B, Sankar S. Silent data corruptions at scale. arXiv: 2102.11245, 2021. https:\/\/arxiv.org\/abs\/2102.11245, Mar. 2025."},{"key":"4634_CR5","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1145\/3458336.3465297","volume-title":"Proc. the 2021 Workshop on Hot Topics in Operating Systems","author":"P H Hochschild","year":"2021","unstructured":"Hochschild P H, Turner P, Mogul J C, Govindaraju R, Ranganathan P, Culler D E, Vahdat A. Cores that don\u2019t count. In Proc. the 2021 Workshop on Hot Topics in Operating Systems, Jun. 2021, pp.9\u201316. DOI: https:\/\/doi.org\/10.1145\/3458336.3465297."},{"key":"4634_CR6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-20943-2","volume-title":"Fault-Tolerance Techniques for High-Performance Computing","author":"T Herault","year":"2015","unstructured":"Herault T, Robert Y. Fault-Tolerance Techniques for High-Performance Computing. Springer, 2015. DOI: https:\/\/doi.org\/10.1007\/978-3-319-20943-2."},{"issue":"9","key":"4634_CR7","doi-asserted-by":"publisher","first-page":"530","DOI":"10.1145\/361147.361115","volume":"17","author":"J W Young","year":"1974","unstructured":"Young J W. A first order approximation to the optimum checkpoint interval. Communications of the ACM, 1974, 17(9): 530\u2013531. DOI: https:\/\/doi.org\/10.1145\/361147.361115.","journal-title":"Communications of the ACM"},{"issue":"3","key":"4634_CR8","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/j.future.2004.11.016","volume":"22","author":"J T Daly","year":"2006","unstructured":"Daly J T. A higher order estimate of the optimum checkpoint interval for restart dumps. Future Generation Computer Systems, 2006, 22(3): 303\u2013312. DOI: https:\/\/doi.org\/10.1016\/j.future.2004.11.016.","journal-title":"Future Generation Computer Systems"},{"key":"4634_CR9","doi-asserted-by":"publisher","first-page":"299","DOI":"10.1109\/IPDPS49936.2021.00038","volume-title":"Proc. the 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","author":"A Frank","year":"2021","unstructured":"Frank A, Baumgartner M, Salkhordeh R, Brinkmann A. Improving checkpointing intervals by considering individual job failure probabilities. In Proc. the 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS), May 2021, pp.299\u2013309. DOI: https:\/\/doi.org\/10.1109\/IPDPS49936.2021.00038."},{"issue":"3","key":"4634_CR10","doi-asserted-by":"publisher","first-page":"507","DOI":"10.1109\/TPDS.2021.3099440","volume":"33","author":"Y Du","year":"2022","unstructured":"Du Y, Marchal L, Pallez G, Robert Y. Optimal checkpointing strategies for iterative applications. IEEE Trans. Parallel and Distributed Systems, 2022, 33(3): 507\u2013522. DOI: https:\/\/doi.org\/10.1109\/TPDS.2021.3099440.","journal-title":"IEEE Trans. Parallel and Distributed Systems"},{"key":"4634_CR11","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1109\/DSN.2014.101","volume-title":"Proc. the 44th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks","author":"D Tiwari","year":"2014","unstructured":"Tiwari D, Gupta S, Vazhkudai S S. Lazy checkpointing: Exploiting temporal locality in failures to mitigate checkpointing overheads on extreme-scale systems. In Proc. the 44th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks, Jun. 2014, pp.25\u201336. DOI: https:\/\/doi.org\/10.1109\/DSN.2014.101."},{"key":"4634_CR12","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1109\/DSN.2014.2","volume-title":"Proc. the 44th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks","author":"J Wei","year":"2014","unstructured":"Wei J, Thomas A, Li G, Pattabiraman K. Quantifying the accuracy of high-level fault injection techniques for hardware faults. In Proc. the 44th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks, Jun. 2014, pp.375\u2013382. DOI: https:\/\/doi.org\/10.1109\/DSN.2014.2."},{"key":"4634_CR13","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1109\/DATE.2009.5090716","volume-title":"Proc. the 2009 Design, Automation & Test in Europe Conference & Exhibition","author":"R Leveugle","year":"2009","unstructured":"Leveugle R, Calvez A, Maistri P, Vanhauwaert P. Statistical fault injection: Quantified error and confidence. In Proc. the 2009 Design, Automation & Test in Europe Conference & Exhibition, Apr. 2009, pp.502\u2013506. DOI: https:\/\/doi.org\/10.1109\/DATE.2009.5090716."},{"key":"4634_CR14","doi-asserted-by":"publisher","first-page":"935","DOI":"10.1109\/HPCA56546.2023.10071105","volume-title":"Proc. the 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","author":"G Papadimitriou","year":"2023","unstructured":"Papadimitriou G, Gizopoulos D. AVGI: Microarchitecture-driven, fast and accurate vulnerability assessment. In Proc. the 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA), Feb. 25\u2013Mar. 1, 2023, pp.935\u2013948. DOI: https:\/\/doi.org\/10.1109\/HPCA56546.2023.10071105."},{"key":"4634_CR15","first-page":"227","volume-title":"Proc. the 2016 IEEE\/ACM International Symposium on Code Generation and Optimization","author":"I Laguna","year":"2016","unstructured":"Laguna I, Schulz M, Richards D F, Calhoun J, Olson L. IPAS: Intelligent protection against silent output corruption in scientific applications. In Proc. the 2016 IEEE\/ACM International Symposium on Code Generation and Optimization, Mar. 2016, pp.227\u2013238."},{"key":"4634_CR16","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1016\/j.jpdc.2021.02.015","volume":"152","author":"L Guo","year":"2021","unstructured":"Guo L, Li D, Laguna I. PARIS: Predicting application resilience using machine learning. Journal of Parallel and Distributed Computing, 2021, 152:111\u2013124. DOI: https:\/\/doi.org\/10.1016\/j.jpdc.2021.02.015.","journal-title":"Journal of Parallel and Distributed Computing"},{"issue":"3","key":"4634_CR17","doi-asserted-by":"publisher","first-page":"305","DOI":"10.1109\/TDMR.2005.853449","volume":"5","author":"R C Baumann","year":"2005","unstructured":"Baumann R C. Radiation-induced soft errors in advanced semiconductor technologies. IEEE Trans. Device and Materials Reliability, 2005, 5(3): 305\u2013366. DOI: https:\/\/doi.org\/10.1109\/TDMR.2005.853449.","journal-title":"IEEE Trans. Device and Materials Reliability"},{"key":"4634_CR18","doi-asserted-by":"publisher","DOI":"10.7873\/DATE.2014.354","volume-title":"Proc. the 2014 Design, Automation & Test in Europe Conference & Exhibition (DATE)","author":"L B Gomez","year":"2014","unstructured":"Gomez L B, Cappello F, Carro L, DeBardeleben N, Fang B, Gurumurthi S, Pattabiraman K, Rech P, Reorda M S. GPGPUs: How to combine high computational power with high reliability. In Proc. the 2014 Design, Automation & Test in Europe Conference & Exhibition (DATE), Mar. 2014. DOI: https:\/\/doi.org\/10.7873\/DATE.2014.354."},{"key":"4634_CR19","doi-asserted-by":"publisher","first-page":"331","DOI":"10.1109\/HPCA.2015.7056044","volume-title":"Proc. the 21st IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"D Tiwari","year":"2015","unstructured":"Tiwari D, Gupta S, Rogers J, Maxwell D, Rech P, Vazhkudai S, Oliveira D, Londo D, DeBardeleben N, Navaux P, Carro L, Bland A. Understanding GPU errors on large-scale HPC systems and the implications for system design and operation. In Proc. the 21st IEEE International Symposium on High Performance Computer Architecture (HPCA), Feb. 2015, pp.331\u2013342. DOI: https:\/\/doi.org\/10.1109\/HPCA.2015.7056044."},{"key":"4634_CR20","doi-asserted-by":"publisher","first-page":"501","DOI":"10.1109\/IPDPS.2013.74","volume-title":"Proc. the 27th IEEE International Symposium on Parallel and Distributed Processing","author":"M S Bouguerra","year":"2013","unstructured":"Bouguerra M S, Gainaru A, Gomez L B, Cappello F, Matsuoka S, Maruyama N. Improving the computing efficiency of HPC systems using a combination of proactive and preventive checkpointing. In Proc. the 27th IEEE International Symposium on Parallel and Distributed Processing, May 2013, pp.501\u2013512. DOI: https:\/\/doi.org\/10.1109\/IPDPS.2013.74."},{"key":"4634_CR21","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1109\/IISWC.2009.5306797","volume-title":"Proc. the 2009 IEEE International Symposium on Workload Characterization (IISWC)","author":"S Che","year":"2009","unstructured":"Che S, Boyer M, Meng J, Tarjan D, Sheaffer J W, Lee S H, Skadron K. Rodinia: A benchmark suite for heterogeneous computing. In Proc. the 2009 IEEE International Symposium on Workload Characterization (IISWC), Oct. 2009, pp.44\u201354. DOI: https:\/\/doi.org\/10.1109\/IISWC.2009.5306797."},{"key":"4634_CR22","volume-title":"Proc. the 2014 Role of Reactor Physics Toward A Sustainable Future (PHYSOR)","author":"J R Tramm","year":"2014","unstructured":"Tramm J R, Siegel A R, Islam T, Schulz M. XSBench-the development and verification of a performance abstraction for Monte Carlo reactor analysis. In Proc. the 2014 Role of Reactor Physics Toward A Sustainable Future (PHYSOR), Sept. 28\u2013Oct. 3, 2014."},{"key":"4634_CR23","series-title":"Technical Report","doi-asserted-by":"publisher","DOI":"10.2172\/993908","volume-title":"Improving performance via mini-applications","author":"P S Crozier","year":"2009","unstructured":"Crozier P S, Thornquist H K, Numrich R W, Williams A B, Edwards H C, Keiter E R, Rajan M, Willenbring J M, Doerfler D W, Heroux M A. Improving performance via mini-applications. Technical Report. Sandia National Laboratories, 2009. https:\/\/scispace.com\/papers\/improving-performance-via-mini-applications-141kz27nto, Mar. 2025."},{"key":"4634_CR24","doi-asserted-by":"publisher","first-page":"632","DOI":"10.1007\/3-540-47789-666","volume-title":"Proc. the 2002 International Conference Amsterdam","author":"R D Falgout","year":"2002","unstructured":"Falgout R D, Yang U M. hypre: A library of high performance preconditioners. In Proc. the 2002 International Conference Amsterdam, Apr. 2002, pp.632\u2013641. DOI: https:\/\/doi.org\/10.1007\/3-540-47789-666."},{"issue":"1","key":"4634_CR25","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1109\/TNNLS.2020.2978386","volume":"32","author":"Z Wu","year":"2021","unstructured":"Wu Z, Pan S, Chen F, Long G, Zhang C, Yu P S. A comprehensive survey on graph neural networks. IEEE Trans. Neural Networks and Learning Systems, 2021, 32(1): 4\u201324. DOI: https:\/\/doi.org\/10.1109\/TNNLS.2020.2978386.","journal-title":"IEEE Trans. Neural Networks and Learning Systems"},{"key":"4634_CR26","doi-asserted-by":"publisher","first-page":"82","DOI":"10.23919\/DATE51398.2021.9474098","volume-title":"Proc. the 2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)","author":"J Jiao","year":"2021","unstructured":"Jiao J, Pal D, Deng C, Zhang Z. GLAIVE: Graph learning assisted instruction vulnerability estimation. In Proc. the 2021 Design, Automation & Test in Europe Conference & Exhibition (DATE), Feb. 2021, pp.82\u201387. DOI: https:\/\/doi.org\/10.23919\/DATE51398.2021.9474098."},{"key":"4634_CR27","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356194","volume-title":"Proc. the 2019 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"C Chen","year":"2019","unstructured":"Chen C, Eisenhauer G, Pande S, Guan Q. CARE: Compiler-assisted recovery from soft failures. In Proc. the 2019 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2019, Article No. 58. DOI: https:\/\/doi.org\/10.1145\/3295500.3356194."},{"key":"4634_CR28","volume-title":"Proc. the 6th International Conference on Learning Representations","author":"P Veli\u010dkovi\u0107","year":"2018","unstructured":"Veli\u010dkovi\u0107 P, Cucurull G, Casanova A, Romero A, Li\u00f2 P, Bengio Y. Graph attention networks. In Proc. the 6th International Conference on Learning Representations, Apr. 30\u2013May 3, 2018."},{"key":"4634_CR29","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1145\/3208040.3208050","volume-title":"Proc. the 27th International Symposium on High-Performance Parallel and Distributed Computing","author":"D Tao","year":"2018","unstructured":"Tao D, Di S, Liang X, Chen Z, Cappello F. Improving performance of iterative methods by lossy checkponting. In Proc. the 27th International Symposium on High-Performance Parallel and Distributed Computing, Jun. 2018, pp.52\u201365. DOI: https:\/\/doi.org\/10.1145\/3208040.3208050."},{"key":"4634_CR30","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1109\/HPCA47549.2020.00013","volume-title":"Proc. the 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"I Akturk","year":"2020","unstructured":"Akturk I, Karpuzcu U R. ACR: Amnesic checkpointing and recovery. In Proc. the 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA), Feb. 2020, pp.30\u201343. DOI: https:\/\/doi.org\/10.1109\/HPCA47549.2020.00013."},{"key":"4634_CR31","doi-asserted-by":"publisher","first-page":"228","DOI":"10.1109\/SC.2016.19","volume-title":"Proc. the 2016 International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Q Liu","year":"2016","unstructured":"Liu Q, Jung C, Lee D, Tiwari D. Compiler-directed lightweight checkpointing for fine-grained guaranteed soft error recovery. In Proc. the 2016 International Conference for High Performance Computing, Networking, Storage and Analysis, Nov. 2016, pp.228\u2013239. DOI: https:\/\/doi.org\/10.1109\/SC.2016.19."},{"key":"4634_CR32","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1109\/CCGRID.2019.00015","volume-title":"Proc. the 19th IEEE\/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID)","author":"K Keller","year":"2019","unstructured":"Keller K, Bautista-Gomez L. Application-level differential checkpointing for HPC applications with dynamic datasets. In Proc. the 19th IEEE\/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID), May 2019, pp.52\u201361. DOI: https:\/\/doi.org\/10.1109\/CCGRID.2019.00015."}],"container-title":["Journal of Computer Science and Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-025-4634-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11390-025-4634-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-025-4634-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T15:15:10Z","timestamp":1757171710000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11390-025-4634-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5]]},"references-count":32,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["4634"],"URL":"https:\/\/doi.org\/10.1007\/s11390-025-4634-6","relation":{},"ISSN":["1000-9000","1860-4749"],"issn-type":[{"value":"1000-9000","type":"print"},{"value":"1860-4749","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,5]]},"assertion":[{"value":"20 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Conflict of interest The authors declare that they have no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics"}}]}}