{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T04:59:32Z","timestamp":1747976372431,"version":"3.41.0"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319172477"},{"type":"electronic","value":"9783319172484"}],"license":[{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015]]},"DOI":"10.1007\/978-3-319-17248-4_11","type":"book-chapter","created":{"date-parts":[[2015,4,17]],"date-time":"2015-04-17T15:11:09Z","timestamp":1429283469000},"page":"215-236","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Assessing General-Purpose Algorithms to Cope with Fail-Stop and Silent Errors"],"prefix":"10.1007","author":[{"given":"Anne","family":"Benoit","sequence":"first","affiliation":[]},{"given":"Aur\u00e9lien","family":"Cavelan","sequence":"additional","affiliation":[]},{"given":"Yves","family":"Robert","sequence":"additional","affiliation":[]},{"given":"Hongyang","family":"Sun","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2015,4,18]]},"reference":[{"issue":"3","key":"11_CR1","doi-asserted-by":"publisher","first-page":"229","DOI":"10.1007\/s10009-012-0263-9","volume":"15","author":"I Assayad","year":"2013","unstructured":"Assayad, I., Girault, A., Kalla, H.: Tradeoff exploration between reliability, power consumption, and execution time for embedded systems. Int. J. Softw. Tools Technol. Transf. 15(3), 229\u2013245 (2013)","journal-title":"Int. J. Softw. Tools Technol. Transf."},{"key":"11_CR2","doi-asserted-by":"crossref","unstructured":"Aupy, G., Benoit, A., Robert, Y.: Energy-aware scheduling under reliability and makespan constraints. In: Proceedings of the International Conference on High Performance Computing (HiPC), pp. 1\u201310 (2012)","DOI":"10.1109\/HiPC.2012.6507482"},{"issue":"1","key":"11_CR3","doi-asserted-by":"publisher","first-page":"3:1","DOI":"10.1145\/1206035.1206038","volume":"54","author":"N Bansal","year":"2007","unstructured":"Bansal, N., Kimbrel, T., Pruhs, K.: Speed scaling to manage energy and temperature. J. ACM 54(1), 3:1\u20133:39 (2007)","journal-title":"J. ACM"},{"key":"11_CR4","doi-asserted-by":"crossref","unstructured":"Benoit, A., Cavelan, A., Robert, Y., Sun, H.: Assessing general-purpose algorithms to cope with fail-stop and silent errors. Research report RR-8599, INRIA, September 2014","DOI":"10.1007\/978-3-319-17248-4_11"},{"key":"11_CR5","unstructured":"Benson, A.R., Schmit, S., Schreiber, R.: Silent error detection in numerical time-stepping schemes. CoRR, abs\/1312.2674 (2013)"},{"issue":"4","key":"11_CR6","doi-asserted-by":"publisher","first-page":"410","DOI":"10.1016\/j.jpdc.2008.12.002","volume":"69","author":"G Bosilca","year":"2009","unstructured":"Bosilca, G., Delmas, R., Dongarra, J., Langou, J.: Algorithm-based fault tolerance applied to high performance computing. J. Parallel Distrib. Comput. 69(4), 410\u2013416 (2009)","journal-title":"J. Parallel Distrib. Comput."},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Bougeret, M., Casanova, H., Rabie, M., Robert, Y., Vivien, F.: Checkpointing strategies for parallel jobs. In: 2011 International Conference for High Performance Computing, Networking, Storage and Analysis (SC), pp. 1\u201311 (2011)","DOI":"10.1145\/2063384.2063428"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Bronevetsky, G., de Supinski, B.: Soft error vulnerability of iterative linear algebra methods. In: Proceedings 22nd International Conference on Supercomputing, ICS 2008, pp. 155\u2013164. ACM (2008)","DOI":"10.1145\/1375527.1375552"},{"issue":"1","key":"11_CR9","doi-asserted-by":"publisher","first-page":"63","DOI":"10.1145\/214451.214456","volume":"3","author":"KM Chandy","year":"1985","unstructured":"Chandy, K.M., Lamport, L.: Distributed snapshots: determining global states of distributed systems. ACM Trans. Comput. Syst. 3(1), 63\u201375 (1985)","journal-title":"ACM Trans. Comput. Syst."},{"key":"11_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Z., Online-ABFT: an online algorithm based fault tolerance scheme for soft error detection in iterative methods. In: Proceedings of the 18th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2013, pp. 167\u2013176. ACM (2013)","DOI":"10.1145\/2442516.2442533"},{"issue":"3","key":"11_CR11","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/j.future.2004.11.016","volume":"22","author":"JT Daly","year":"2004","unstructured":"Daly, J.T.: A higher order estimate of the optimum checkpoint interval for restart dumps. FGCS 22(3), 303\u2013312 (2004)","journal-title":"FGCS"},{"key":"11_CR12","doi-asserted-by":"crossref","unstructured":"Das, A., Kumar, A., Veeravalli, B., Bolchini, C., Miele, A.: Combined DVFS and mapping exploration for lifetime and soft-error susceptibility improvement in MPSoCs. In: Proceedings of the Conference on Design, Automation and Test in Europe (DATE), pp. 1\u20136 (2014)","DOI":"10.7873\/DATE2014.074"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Dixit, A., Wood, A.: The impact of new technology on soft error rates. In: IEEE International on Reliability Physics Symposium (IRPS), pp. 5B.4.1\u20135B.4.7 (2011)","DOI":"10.1109\/IRPS.2011.5784522"},{"issue":"1","key":"11_CR14","doi-asserted-by":"publisher","first-page":"163","DOI":"10.1145\/2318857.2254778","volume":"40","author":"N El-Sayed","year":"2012","unstructured":"El-Sayed, N., Stefanovici, I.A., Amvrosiadis, G., Hwang, A.A., Schroeder, B.: Temperature management in data centers: why some (might) like it hot. SIGMETRICS Perform. Eval. Rev. 40(1), 163\u2013174 (2012)","journal-title":"SIGMETRICS Perform. Eval. Rev."},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Elliott, J., Kharbas, K., Fiala, D., Mueller, F., Ferreira, K., Engelmann, C.: Combining partial redundancy and checkpointing for HPC. In: Proceedings of the ICDCS 2012. IEEE Computer Society (2012)","DOI":"10.1109\/ICDCS.2012.56"},{"key":"11_CR16","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1145\/568522.568525","volume":"34","author":"ENM Elnozahy","year":"2002","unstructured":"Elnozahy, E.N.M., Alvisi, L., Wang, Y.-M., Johnson, D.B.: A survey of rollback-recovery protocols in message-passing systems. ACM Comput. Surv. 34, 375\u2013408 (2002)","journal-title":"ACM Comput. Surv."},{"issue":"7","key":"11_CR17","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1145\/957717.957772","volume":"1","author":"W-C Feng","year":"2003","unstructured":"Feng, W.-C.: Making a case for efficient supercomputing. Queue 1(7), 54\u201364 (2003)","journal-title":"Queue"},{"key":"11_CR18","doi-asserted-by":"crossref","unstructured":"Fiala, D., Mueller, F., Engelmann, C., Riesen, R., Ferreira, K., Brightwell, R.: Detection and correction of silent data corruption for large-scale high-performance computing. In: Proceedings of the ACM\/IEEE SC International Conference SC 2012. IEEE Computer Society Press (2012)","DOI":"10.1109\/SC.2012.49"},{"key":"11_CR19","unstructured":"Heroux, M., Hoemmen, M.: Fault-tolerant iterative methods via selective reliability. Research report SAND2011-3915 C, Sandia National Laboratories (2011)"},{"key":"11_CR20","unstructured":"Hsu, C.-H., Chun Feng, W.: A power-aware run-time system for high-performance computing. In: Proceedings of the ACM\/IEEE Supercomputing Conference, pp. 1\u20139 (2005)"},{"issue":"6","key":"11_CR21","doi-asserted-by":"publisher","first-page":"518","DOI":"10.1109\/TC.1984.1676475","volume":"33","author":"K-H Huang","year":"1984","unstructured":"Huang, K.-H., Abraham, J.A.: Algorithm-based fault tolerance for matrix operations. IEEE Trans. Comput. 33(6), 518\u2013528 (1984)","journal-title":"IEEE Trans. Comput."},{"issue":"1","key":"11_CR22","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1145\/2189750.2150989","volume":"40","author":"AA Hwang","year":"2012","unstructured":"Hwang, A.A., Stefanovici, I.A., Schroeder, B.: Cosmic rays don\u2019t strike twice: understanding the nature of dram errors and the implications for system design. SIGARCH Comput. Archit. News 40(1), 111\u2013122 (2012)","journal-title":"SIGARCH Comput. Archit. News"},{"key":"11_CR23","doi-asserted-by":"crossref","unstructured":"Lu, G., Zheng, Z., Chien, A.A.: When is multi-version checkpointing needed. In: 3rd Workshop for Fault-Tolerance at Extreme Scale (FTXS). ACM Press (2013). https:\/\/sites.google.com\/site\/uchicagolssg\/lssg\/research\/gvr","DOI":"10.1145\/2465813.2465821"},{"issue":"2","key":"11_CR24","doi-asserted-by":"publisher","first-page":"200","DOI":"10.1147\/rd.62.0200","volume":"6","author":"RE Lyons","year":"1962","unstructured":"Lyons, R.E., Vanderkulk, W.: The use of triple-modular redundancy to improve computer reliability. IBM J. Res. Dev. 6(2), 200\u2013209 (1962)","journal-title":"IBM J. Res. Dev."},{"key":"11_CR25","first-page":"130","volume":"3","author":"T Ozaki","year":"2006","unstructured":"Ozaki, T., Dohi, T., Okamura, H., Kaio, N.: Distribution-free checkpoint placement algorithms based on min-max principle. IEEE TDSC 3, 130\u2013140 (2006)","journal-title":"IEEE TDSC"},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Patterson, M.: The effect of data center temperature on energy efficiency. In: Proceedings of 11th Intersociety Conference on Thermal and Thermomechanical Phenomena in Electronic Systems, pp. 1167\u20131174 (2008)","DOI":"10.1109\/ITHERM.2008.4544393"},{"key":"11_CR27","volume-title":"Energy-Efficient Distributed Computing Systems","author":"NB Rizvandi","year":"2012","unstructured":"Rizvandi, N.B., Zomaya, A.Y., Lee, Y.C., Boloori, A.J., Taheri, J.: Multiple frequency selection in DVFS-enabled processors to minimize energy consumption. In: Zomaya, A.Y., Lee, Y.C. (eds.) Energy-Efficient Distributed Computing Systems. Wiley, Hoboken (2012)"},{"key":"11_CR28","doi-asserted-by":"crossref","unstructured":"Sao, P., Vuduc, R.:Self-stabilizing iterative solvers. In: Proceedings ScalA 2013. ACM (2013)","DOI":"10.1145\/2530268.2530272"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Sarood, O., Meneses, E., Kale, L. V.: A \u2018cool\u2019 way of improving the reliability of HPC machines. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 58:1\u201358:12 (2013)","DOI":"10.1145\/2503210.2503228"},{"key":"11_CR30","doi-asserted-by":"crossref","unstructured":"Shantharam, M., Srinivasmurthy, S., Raghavan, P.: Fault tolerant preconditioned conjugate gradient for sparse linear system solution. In: Proceedings of the ICS 2012. ACM (2012)","DOI":"10.1145\/2304576.2304588"},{"issue":"3","key":"11_CR31","doi-asserted-by":"publisher","first-page":"630","DOI":"10.1137\/0213039","volume":"13","author":"S Toueg","year":"1984","unstructured":"Toueg, S., Babaoglu, \u00d6.: On the optimum checkpoint selection problem. SIAM J. Comput. 13(3), 630\u2013649 (1984)","journal-title":"SIAM J. Comput."},{"key":"11_CR32","unstructured":"Yao, F., Demers, A., Shenker, S.: A scheduling model for reduced CPU energy. In: Proceedings of the 36th Annual Symposium on Foundations of Computer Science (FOCS), p. 374 (1995)"},{"issue":"9","key":"11_CR33","doi-asserted-by":"publisher","first-page":"530","DOI":"10.1145\/361147.361115","volume":"17","author":"JW Young","year":"1974","unstructured":"Young, J.W.: A first order approximation to the optimum checkpoint interval. Comm. ACM 17(9), 530\u2013531 (1974)","journal-title":"Comm. ACM"},{"key":"11_CR34","doi-asserted-by":"crossref","unstructured":"Zhao, B., Aydin, H., Zhu, D.: Reliability-aware dynamic voltage scaling for energy-constrained real-time embedded systems. In: Proceedings of the IEEE International Conference on Computer Design (ICCD), pp. 633\u2013639 (2008)","DOI":"10.1109\/ICCD.2008.4751927"},{"key":"11_CR35","unstructured":"Zhu, D., Melhem, R., Mosse, D.: The effects of energy management on reliability in real-time embedded systems. In: Proceedings of the IEEE\/ACM International Conference on Computer-Aided Design (ICCAD), pp. 35\u201340 (2004)"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing Systems. Performance Modeling, Benchmarking, and Simulation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-17248-4_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T17:55:55Z","timestamp":1747936555000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-17248-4_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015]]},"ISBN":["9783319172477","9783319172484"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-17248-4_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2015]]},"assertion":[{"value":"18 April 2015","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}