{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T00:56:33Z","timestamp":1770684993248,"version":"3.49.0"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319619811","type":"print"},{"value":"9783319619828","type":"electronic"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-61982-8_3","type":"book-chapter","created":{"date-parts":[[2017,7,13]],"date-time":"2017-07-13T09:22:20Z","timestamp":1499937740000},"page":"11-18","source":"Crossref","is-referenced-by-count":4,"title":["Hard Faults and Soft-Errors: Possible Numerical Remedies in Linear Algebra Solvers"],"prefix":"10.1007","author":[{"given":"E.","family":"Agullo","sequence":"first","affiliation":[]},{"given":"S.","family":"Cools","sequence":"additional","affiliation":[]},{"given":"L.","family":"Giraud","sequence":"additional","affiliation":[]},{"given":"A.","family":"Moreau","sequence":"additional","affiliation":[]},{"given":"P.","family":"Salas","sequence":"additional","affiliation":[]},{"given":"W.","family":"Vanroose","sequence":"additional","affiliation":[]},{"given":"E. F.","family":"Yetkin","sequence":"additional","affiliation":[]},{"given":"M.","family":"Zounon","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,7,14]]},"reference":[{"key":"3_CR1","unstructured":"Agullo, E., Cools, S., Giraud, L., Vanroose, W., Yetkin, F.E.: On the sensitivity of CG to soft-errors and robust numerical detection mechanisms. Research Report in Preparation, Inria (2017)"},{"key":"3_CR2","unstructured":"Agullo, E., GiraudL, L., Moreau, A.: Adaptive soft-error detection criterion for GMRES. Research Report in Preparation, Inria (2017)"},{"key":"3_CR3","doi-asserted-by":"crossref","first-page":"888","DOI":"10.1002\/nla.2059","volume":"23","author":"E Agullo","year":"2016","unstructured":"Agullo, E., Giraud, L., Guermouche, A., Roman, J., Zounon, M.: Numerical recovery strategies for parallel resilient Krylov linear solvers. Numer. Linear Algebra Appl. 23, 888\u2013905 (2016)","journal-title":"Numer. Linear Algebra Appl."},{"issue":"5","key":"3_CR4","doi-asserted-by":"crossref","first-page":"C560","DOI":"10.1137\/15M1042115","volume":"38","author":"E Agullo","year":"2016","unstructured":"Agullo, E., Giraud, L., Salas, P., Zounon, M.: Interpolation-restart strategies for resilient eigensolvers. SIAM J. Sci. Comput. 38(5), C560\u2013C583 (2016)","journal-title":"SIAM J. Sci. Comput."},{"issue":"2","key":"3_CR5","doi-asserted-by":"crossref","first-page":"149","DOI":"10.1109\/32.666828","volume":"24","author":"L Alvisi","year":"1998","unstructured":"Alvisi, L., Marzullo, K.: Message logging: pessimistic, optimistic, causal, and optimal. IEEE Trans. Softw. Eng. 24(2), 149\u2013159 (1998)","journal-title":"IEEE Trans. Softw. Eng."},{"key":"3_CR6","doi-asserted-by":"crossref","first-page":"1599","DOI":"10.1109\/12.9736","volume":"37","author":"J Anfinson","year":"1988","unstructured":"Anfinson, J., Luk, F.T.: A linear algebraic model of algorithm-based fault tolerance. IEEE Trans. Comput. 37, 1599\u20131604 (1988)","journal-title":"IEEE Trans. Comput."},{"key":"3_CR7","doi-asserted-by":"crossref","unstructured":"Austin, T.M.: DIVA: a reliable substrate for deep submicron microarchitecture design. In: Proceedings of the 32nd Annual ACM\/IEEE International Symposium on Microarchitecture, MICRO 32, Washington, DC, pp. 196\u2013207. IEEE Computer Society (1999)","DOI":"10.1109\/MICRO.1999.809458"},{"key":"3_CR8","doi-asserted-by":"crossref","first-page":"111","DOI":"10.1142\/S0129626411000126","volume":"21","author":"F Cappello","year":"2011","unstructured":"Cappello, F., Casanova, H., Robert, Y.: Preventive migration vs. preventive checkpointing for extreme scale supercomputers. Parallel Process. Lett. 21, 111\u2013132 (2011)","journal-title":"Parallel Process. Lett."},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Z.: Online-ABFT: an online algorithm based fault tolerance scheme for soft error detection in iterative methods. In: ACM SIGPLAN Notices, vol. 48, pp. 167\u2013176. ACM (2013)","DOI":"10.1145\/2442516.2442533"},{"issue":"3","key":"3_CR10","doi-asserted-by":"crossref","first-page":"375","DOI":"10.1145\/568522.568525","volume":"34","author":"EN Elnozahy","year":"2002","unstructured":"Elnozahy, E.N., Alvisi, L., Wang, Y.-M., Johnson, D.B.: A survey of rollback-recovery protocols in message-passing systems. ACM Comput. Surv. 34(3), 375\u2013408 (2002)","journal-title":"ACM Comput. Surv."},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Elnozahy, E.N., Johnson, D.B., Zwaenepoel, W.: The performance of consistent checkpointing. In: Proceedings of the 11th Symposium on Reliable Distributed Systems, pp. 39\u201347, October 1992","DOI":"10.1109\/RELDIS.1992.235144"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Gunnels, J.A., Van De Geijn, R.A., Katz, D.S., Quintana-ort\u00ed, E.S.: Fault-tolerant high-performance matrix multiplication: theory and practice. In: Dependable Systems and Networks, pp. 47\u201356 (2001)","DOI":"10.1109\/DSN.2001.941390"},{"key":"3_CR13","doi-asserted-by":"crossref","first-page":"518","DOI":"10.1109\/TC.1984.1676475","volume":"33","author":"K-H Huang","year":"1984","unstructured":"Huang, K.-H., Abraham, J.A.: Algorithm-based fault tolerance for matrix operations. IEEE Trans. Comput. 33, 518\u2013528 (1984)","journal-title":"IEEE Trans. Comput."},{"issue":"6","key":"3_CR14","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/MM.2005.119","volume":"25","author":"RK Iyer","year":"2005","unstructured":"Iyer, R.K., Nakka, N.M., Kalbarczyk, Z.T., Mitra, S.: Recent advances and new avenues in hardware-level reliability support. IEEE Micro 25(6), 18\u201329 (2005)","journal-title":"IEEE Micro"},{"key":"3_CR15","unstructured":"Johnson, D.B., Zwaenepoel, W.: Sender-based message logging (1987)"},{"key":"3_CR16","doi-asserted-by":"crossref","first-page":"102","DOI":"10.1137\/040620394","volume":"30","author":"J Langou","year":"2007","unstructured":"Langou, J., Chen, Z., Bosilca, G., Dongarra, J.: Recovery patterns for iterative methods in a parallel unstable environment. SIAM J. Sci. Comput. 30, 102\u2013116 (2007)","journal-title":"SIAM J. Sci. Comput."},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Li, C.-C.J., Fuchs, W.K.: Catch-compiler-assisted techniques for checkpointing. In: 20th International Symposium on Fault-Tolerant Computing. FTCS-20. Digest of Papers, pp. 74\u201381, June 1990","DOI":"10.1109\/FTCS.1990.89337"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Liu, Y., Nassar, R., Leangsuksun, C.B., Naksinehaboon, N., Paun, M., Scott, S.L.: An optimal checkpoint\/restart model for a large scale high performance computing system. In: IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2008), pp. 1\u20139, April 2008","DOI":"10.1109\/IPDPS.2008.4536279"},{"issue":"1","key":"3_CR19","doi-asserted-by":"crossref","first-page":"63","DOI":"10.1109\/24.994913","volume":"51","author":"N Oh","year":"2002","unstructured":"Oh, N., Shirvani, P.P., McCluskey, E.J.: Error detection by duplicated instructions in super-scalar processors. IEEE Trans. Reliab. 51(1), 63\u201375 (2002)","journal-title":"IEEE Trans. Reliab."},{"issue":"2","key":"3_CR20","doi-asserted-by":"crossref","first-page":"125","DOI":"10.1006\/jpdc.1997.1336","volume":"43","author":"JS Plank","year":"1997","unstructured":"Plank, J.S., Kim, Y., Dongarra, J.: Fault tolerant matrix operations for networks of workstations using diskless checkpointing. J. Parallel Distrib. Comput. 43(2), 125\u2013138 (1997)","journal-title":"J. Parallel Distrib. Comput."},{"key":"3_CR21","unstructured":"Plank, J.: An overview of checkpointing in uniprocessor and distributed systems, focusing on implementation and performance. Technical report UT-CS-97-372, Department of Computer Science, University of Tennessee (1997)"},{"issue":"2","key":"3_CR22","doi-asserted-by":"crossref","first-page":"62","DOI":"10.1109\/88.311574","volume":"2","author":"JS Plank","year":"1994","unstructured":"Plank, J.S., Li, K.: ICKP: a consistent checkpointer for multicomputers. Parallel Distrib. Technol. Syst. Appl. 2(2), 62\u201367 (1994). IEEE","journal-title":"Parallel Distrib. Technol. Syst. Appl."},{"key":"3_CR23","unstructured":"Raju, N., Liu, Y., Leangsuksun, C.B., Nassar, R., Scott, S.: Reliability Analysis in HPC clusters. In: Proceedings of the High Availability and Performance Computing Workshop (2006)"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Sancho, J.C., Petrini, F., Davis, K., Gioiosa, R., Jiang, S.: Current practice and a direction forward in checkpoint\/restart implementations for fault tolerance. In: Proceedings of 19th IEEE International Parallel and Distributed Processing Symposium, April 2005","DOI":"10.1109\/IPDPS.2005.157"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Scholzel, M.: Reduced triple modular redundancy for built-in self-repair in VLIW-processors. In: Signal Processing Algorithms, Architectures, Arrangements and Applications, pp. 21\u201326 (2007)","DOI":"10.1109\/SPA.2007.5903294"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Vijaykumar, T.N., Pomeranz, I., Cheng, K.: Transient-fault recovery using simultaneous multithreading. In: Proceedings of the 29th Annual International Symposium on Computer Architecture, pp. 87\u201398 (2002)","DOI":"10.1109\/ISCA.2002.1003565"},{"key":"3_CR27","doi-asserted-by":"crossref","unstructured":"Wang, C., Mueller, F., Engelmann, C., Scott, S.L.: Hybrid full\/incremental checkpoint\/restart for MPI jobs in HPC environments. Department of Computer Science, North Carolina State University (2009)","DOI":"10.1109\/ICPADS.2010.48"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing for Computational Science \u2013 VECPAR 2016"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-61982-8_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T11:33:57Z","timestamp":1750592037000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-61982-8_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319619811","9783319619828"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-61982-8_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017]]}}}