{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T16:36:08Z","timestamp":1772642168031,"version":"3.50.1"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030507428","type":"print"},{"value":"9783030507435","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-50743-5_27","type":"book-chapter","created":{"date-parts":[[2020,6,15]],"date-time":"2020-06-15T19:03:45Z","timestamp":1592247825000},"page":"536-554","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Reinit$$^{++}$$: Evaluating the Performance of Global-Restart Recovery Methods for MPI Fault Tolerance"],"prefix":"10.1007","author":[{"given":"Giorgis","family":"Georgakoudis","sequence":"first","affiliation":[]},{"given":"Luanzheng","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Ignacio","family":"Laguna","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,6,15]]},"reference":[{"key":"27_CR1","doi-asserted-by":"crossref","unstructured":"Adam, J., et al.: Transparent high-speed network checkpoint\/restart in MPI. In: Proceedings of the 25th European MPI Users\u2019 Group Meeting, p. 12 (2018)","DOI":"10.1145\/3236367.3236383"},{"key":"27_CR2","doi-asserted-by":"publisher","first-page":"204","DOI":"10.1016\/j.parco.2019.02.006","volume":"85","author":"J Adam","year":"2019","unstructured":"Adam, J., et al.: Checkpoint\/restart approaches for a thread-based MPI runtime. Parallel Comput. 85, 204\u2013219 (2019)","journal-title":"Parallel Comput."},{"key":"27_CR3","doi-asserted-by":"publisher","unstructured":"Bautista-Gomez, L., Tsuboi, S., Komatitsch, D., Cappello, F., Maruyama, N., Matsuoka, S.: FTI: high performance fault tolerance interface for hybrid systems. In: SC 2011: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312, November 2011. https:\/\/doi.org\/10.1145\/2063384.2063427","DOI":"10.1145\/2063384.2063427"},{"issue":"3","key":"27_CR4","doi-asserted-by":"publisher","first-page":"244","DOI":"10.1177\/1094342013488238","volume":"27","author":"W Bland","year":"2013","unstructured":"Bland, W., Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J.: Post-failure recovery of MPI communication capability: design and rationale. Int. J. High Performance Comput. Appl. 27(3), 244\u2013254 (2013)","journal-title":"Int. J. High Performance Comput. Appl."},{"key":"27_CR5","doi-asserted-by":"crossref","unstructured":"Bland, W., Lu, H., Seo, S., Balaji, P.: Lessons learned implementing user-level failure mitigation in mpich. In: 2015 15th IEEE\/ACM International Symposium on Cluster, Cloud and Grid Computing (2015)","DOI":"10.1109\/CCGrid.2015.51"},{"key":"27_CR6","doi-asserted-by":"crossref","unstructured":"Bosilca, G., et al.: Mpich-v: toward a scalable fault tolerant MPI for volatile nodes. In: SC 2002: Proceedings of the 2002 ACM\/IEEE Conference on Supercomputing, pp. 29\u201329. IEEE (2002)","DOI":"10.1109\/SC.2002.10048"},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Bosilca, G., et al.: Failure detection and propagation in HPC systems. In: SC 2016: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 312\u2013322 (2016)","DOI":"10.1109\/SC.2016.26"},{"key":"27_CR8","doi-asserted-by":"publisher","unstructured":"Bosilca, G., et al.: A failure detector for HPC platforms. Int. J. High Performance Comput. Appl. 32(1), 139\u2013158 (2018). https:\/\/doi.org\/10.1177\/1094342017711505","DOI":"10.1177\/1094342017711505"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Bouteiller, A., Bosilca, G., Dongarra, J.J.: Plan B: Interruption of ongoing MPI operations to support failure recovery. In: Proceedings of the 22nd European MPI Users\u2019 Group Meeting, p. 11 (2015)","DOI":"10.1145\/2802658.2802668"},{"key":"27_CR10","doi-asserted-by":"crossref","unstructured":"Cao, J., et al.: System-level scalable checkpoint-restart for petascale computing. In: 2016 IEEE 22nd International Conference on Parallel and Distributed Systems (ICPADS) (2016)","DOI":"10.1109\/ICPADS.2016.0125"},{"key":"27_CR11","doi-asserted-by":"publisher","unstructured":"Chakraborty, S., et al.: Ereinit: scalable and efficient fault-tolerance for bulk-synchronous MPI applications. Concurrency and Computation: Practice and Experience, e4863. https:\/\/doi.org\/10.1002\/cpe.4863, https:\/\/onlinelibrary.wiley.com\/doi\/abs\/10.1002\/cpe.4863, e4863 cpe.4863","DOI":"10.1002\/cpe.4863"},{"key":"27_CR12","doi-asserted-by":"publisher","unstructured":"Dongarra, J., et al.: The international exascale software project roadmap. Int. J. High Perform. Comput. Appl. 25(1), 3\u201360 (2011). https:\/\/doi.org\/10.1177\/1094342010391989, http:\/\/dx.doi.org\/10.1177\/1094342010391989","DOI":"10.1177\/1094342010391989"},{"key":"27_CR13","doi-asserted-by":"publisher","unstructured":"Gamell, M., Katz, D.S., Kolla, H., Chen, J., Klasky, S., Parashar, M.: Exploring automatic, online failure recovery for scientific applications at extreme scales. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 895\u2013906. SC 2014, IEEE Press, Piscataway, NJ, USA (2014). https:\/\/doi.org\/10.1109\/SC.2014.78","DOI":"10.1109\/SC.2014.78"},{"key":"27_CR14","doi-asserted-by":"crossref","unstructured":"Gamell, M., et al.: Local recovery and failure masking for stencil-based applications at extreme scales. In: SC 2015: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312 (2015)","DOI":"10.1145\/2807591.2807672"},{"key":"27_CR15","doi-asserted-by":"crossref","unstructured":"Hargrove, P.H., Duell, J.C.: Berkeley lab checkpoint\/restart (BLCR) for linux clusters. In: Journal of Physics: Conference Series. vol. 46, p. 494 (2006)","DOI":"10.1088\/1742-6596\/46\/1\/067"},{"key":"27_CR16","doi-asserted-by":"crossref","unstructured":"Herault, T., et al.: Practical scalable consensus for pseudo-synchronous distributed systems. In: SC 2015: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312 (2015)","DOI":"10.1145\/2807591.2807665"},{"key":"27_CR17","doi-asserted-by":"crossref","unstructured":"Hori, A., Yoshinaga, K., Herault, T., Bouteiller, A., Bosilca, G., Ishikawa, Y.: Sliding substitution of failed nodes. In: Proceedings of the 22nd European MPI Users\u2019 Group Meeting, p. 14. ACM (2015)","DOI":"10.1145\/2802658.2802670"},{"key":"27_CR18","doi-asserted-by":"crossref","unstructured":"Katti, A., Di Fatta, G., Naughton, T., Engelmann, C.: Scalable and fault tolerant failure detection and consensus. In: Proceedings of the 22nd European MPI Users\u2019 Group Meeting, p. 13 (2015)","DOI":"10.1145\/2802658.2802660"},{"issue":"5","key":"27_CR19","doi-asserted-by":"publisher","first-page":"729","DOI":"10.1177\/1094342017690910","volume":"32","author":"A Katti","year":"2018","unstructured":"Katti, A., Di Fatta, G., Naughton, T., Engelmann, C.: Epidemic failure detection and consensus for extreme parallelism. Int. J. High Performance Comput. Appl. 32(5), 729\u2013743 (2018)","journal-title":"Int. J. High Performance Comput. Appl."},{"issue":"4","key":"27_CR20","doi-asserted-by":"publisher","first-page":"571","DOI":"10.1177\/1094342018767736","volume":"33","author":"N Kohl","year":"2019","unstructured":"Kohl, N., et al.: A scalable and extensible checkpointing scheme for massively parallel simulations. Int. J. High Performance Comput. Appl. 33(4), 571\u2013589 (2019)","journal-title":"Int. J. High Performance Comput. Appl."},{"key":"27_CR21","doi-asserted-by":"publisher","unstructured":"Laguna, I., Richards, D.F., Gamblin, T., Schulz, M., de Supinski, B.R.: Evaluating user-level fault tolerance for MPI applications. In: Proceedings of the 21st European MPI Users\u2019 Group Meeting, pp. 57:57\u201357:62. EuroMPI\/ASIA 2014, ACM, New York, NY, USA (2014). https:\/\/doi.org\/10.1145\/2642769.2642775, http:\/\/doi.acm.org\/10.1145\/2642769.2642775","DOI":"10.1145\/2642769.2642775"},{"issue":"3","key":"27_CR22","doi-asserted-by":"publisher","first-page":"305","DOI":"10.1177\/1094342015623623","volume":"30","author":"I Laguna","year":"2016","unstructured":"Laguna, I., et al.: Evaluating and extending user-level fault tolerance in MPI applications. Int. J. High Performance Comput. Appl. 30(3), 305\u2013319 (2016). https:\/\/doi.org\/10.1177\/1094342015623623","journal-title":"Int. J. High Performance Comput. Appl."},{"key":"27_CR23","doi-asserted-by":"crossref","unstructured":"Losada, N., Cores, I., Mart\u00edn, M.J., Gonz\u00e1lez, P.: Resilient MPI applications using an application-level checkpointing framework and ULFM. The Journal of Supercomputing 73(1) (2017)","DOI":"10.1007\/s11227-016-1629-7"},{"key":"27_CR24","doi-asserted-by":"publisher","unstructured":"Martino, C.D., Kalbarczyk, Z., Iyer, R.K., Baccanico, F., Fullop, J., Kramer, W.: Lessons learned from the analysis of system failures at petascale: The case of blue waters. In: 2014 44th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks. pp. 610\u2013621, June 2014. https:\/\/doi.org\/10.1109\/DSN.2014.62","DOI":"10.1109\/DSN.2014.62"},{"issue":"9","key":"27_CR25","doi-asserted-by":"publisher","first-page":"2255","DOI":"10.1109\/TPDS.2013.100","volume":"25","author":"K Mohror","year":"2014","unstructured":"Mohror, K., Moody, A., Bronevetsky, G., de Supinski, B.R.: Detailed modeling and evaluation of a scalable multilevel checkpointing system. IEEE Trans. Parallel Distrib. Syst. 25(9), 2255\u20132263 (2014). https:\/\/doi.org\/10.1109\/TPDS.2013.100","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"27_CR26","first-page":"471","volume":"25","author":"S Pauli","year":"2014","unstructured":"Pauli, S., Kohler, M., Arbenz, P.: A fault tolerant implementation of multi-level monte carlo methods. Parallel Comput. Acceler. Comput. Sci. Eng. (CSE) 25, 471\u2013480 (2014)","journal-title":"Parallel Comput. Acceler. Comput. Sci. Eng. (CSE)"},{"issue":"4","key":"27_CR27","first-page":"479","volume":"19","author":"S Sankaran","year":"2005","unstructured":"Sankaran, S., et al.: The lam\/mpi checkpoint\/restart framework: system-initiated checkpointing. JHPCA 19(4), 479\u2013493 (2005)","journal-title":"JHPCA"},{"issue":"3","key":"27_CR28","doi-asserted-by":"publisher","first-page":"501","DOI":"10.1109\/TPDS.2018.2866794","volume":"30","author":"F Shahzad","year":"2018","unstructured":"Shahzad, F., Thies, J., Kreutzer, M., Zeiser, T., Hager, G., Wellein, G.: Craft: a library for easier application-level checkpoint\/restart and automatic fault tolerance. IEEE Trans. Parallel Distrib. Syst. 30(3), 501\u2013514 (2018)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"5","key":"27_CR29","doi-asserted-by":"publisher","first-page":"641","DOI":"10.1177\/1094342016669416","volume":"32","author":"O Subasi","year":"2018","unstructured":"Subasi, O., Martsinkevich, T., Zyulkyarov, F., Unsal, O., Labarta, J., Cappello, F.: Unified fault-tolerance framework for hybrid task-parallel message-passing applications. Int. J. High Performance Comput. Appl. 32(5), 641\u2013657 (2018)","journal-title":"Int. J. High Performance Comput. Appl."},{"key":"27_CR30","doi-asserted-by":"publisher","unstructured":"Sultana, N., R\u00fcfenacht, M., Skjellum, A., Laguna, I., Mohror, K.: Failure recovery for bulk synchronous applications with MPI stages. Parallel Comput. 84, 1\u201314 (2019). https:\/\/doi.org\/10.1016\/j.parco.2019.02.007, http:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167819118303260","DOI":"10.1016\/j.parco.2019.02.007"},{"key":"27_CR31","doi-asserted-by":"crossref","unstructured":"Teranishi, K., Heroux, M.A.: Toward local failure local recovery resilience model using MPI-ULFM. In: Proceedings of the 21st European MPI Users\u2019 Group Meeting, p. 51 (2014)","DOI":"10.1145\/2642769.2642774"},{"issue":"8","key":"27_CR32","doi-asserted-by":"publisher","first-page":"1678","DOI":"10.1109\/TPDS.2018.2808519","volume":"29","author":"Z Wang","year":"2018","unstructured":"Wang, Z., Gao, L., Gu, Y., Bao, Y., Yu, G.: A fault-tolerant framework for asynchronous iterative computations in cloud environments. IEEE Trans. Parallel Distrib. Syst. 29(8), 1678\u20131692 (2018)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"27_CR33","doi-asserted-by":"publisher","unstructured":"Zheng, G., Xiang N., Kal\u00e9, L.V.: A scalable double in-memory checkpoint and restart scheme towards exascale. In: IEEE\/IFIP International Conference on Dependable Systems and Networks Workshops (DSN 2012), pp. 1\u20136, June 2012. https:\/\/doi.org\/10.1109\/DSNW.2012.6264677","DOI":"10.1109\/DSNW.2012.6264677"},{"key":"27_CR34","doi-asserted-by":"publisher","unstructured":"Zheng, G., Huang, C., Kal\u00e9, L.V.: Performance evaluation of automatic checkpoint-based fault tolerance for ampi and charm++. SIGOPS Oper. Syst. Rev. 40(2), 90\u201399 (2006). https:\/\/doi.org\/10.1145\/1131322.1131340, http:\/\/doi.acm.org\/10.1145\/1131322.1131340","DOI":"10.1145\/1131322.1131340"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-50743-5_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,18]],"date-time":"2023-12-18T20:04:46Z","timestamp":1702929886000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-50743-5_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030507428","9783030507435"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-50743-5_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"15 June 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISC High Performance","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on High Performance Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Frankfurt am Main","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 June 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 June 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"35","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"supercomputing2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.isc-hpc.com\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Linklings","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"87","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"31% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.73","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.33","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}