{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T06:53:12Z","timestamp":1743144792714,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030507428"},{"type":"electronic","value":"9783030507435"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-50743-5_23","type":"book-chapter","created":{"date-parts":[[2020,6,15]],"date-time":"2020-06-15T19:03:45Z","timestamp":1592247825000},"page":"455-473","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["TeaMPI\u2014Replication-Based Resilience Without the (Performance) Pain"],"prefix":"10.1007","author":[{"given":"Philipp","family":"Samfass","sequence":"first","affiliation":[]},{"given":"Tobias","family":"Weinzierl","sequence":"additional","affiliation":[]},{"given":"Benjamin","family":"Hazelwood","sequence":"additional","affiliation":[]},{"given":"Michael","family":"Bader","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,6,15]]},"reference":[{"issue":"6","key":"23_CR1","doi-asserted-by":"publisher","first-page":"897","DOI":"10.1177\/1094342016684006","volume":"32","author":"M Altenbernd","year":"2018","unstructured":"Altenbernd, M., G\u00f6ddeke, D.: Soft fault detection and correction for multigrid. Int. J. High Perform. Comput. Appl. 32(6), 897\u2013912 (2018)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Biswas, S., de Supinski, B.R., Schulz, M., Franklin, D., Sherwood, T., Chong, F.T.: Exploiting data similarity to reduce memory footprints. In: 2011 IEEE International Parallel and Distributed Processing Symposium, pp. 152\u2013163 (2011)","DOI":"10.1109\/IPDPS.2011.24"},{"issue":"3","key":"23_CR3","doi-asserted-by":"publisher","first-page":"244","DOI":"10.1177\/1094342013488238","volume":"27","author":"W Bland","year":"2013","unstructured":"Bland, W., Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J.: Post-failure recovery of MPI communication capability: design and rationale. Int. J. High Perform. Comput. Appl. 27(3), 244\u2013254 (2013)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Cao, C., Herault, T., Bosilca, G., Dongarra, J.: Design for a soft error resilient dynamic task-based runtime. In: 2015 IEEE International Parallel and Distributed Processing Symposium, pp. 765\u2013774 (2015)","DOI":"10.1109\/IPDPS.2015.81"},{"issue":"3","key":"23_CR5","doi-asserted-by":"publisher","first-page":"212","DOI":"10.1177\/1094342009106189","volume":"23","author":"F Cappello","year":"2009","unstructured":"Cappello, F.: Fault tolerance in petascale\/ exascale systems: current knowledge, challenges and research opportunities. Int. J. High Perform. Comput. Appl. 23(3), 212\u2013226 (2009)","journal-title":"Int. J. High Perform. Comput. Appl."},{"issue":"5","key":"23_CR6","doi-asserted-by":"publisher","first-page":"973","DOI":"10.1177\/1094342019842645","volume":"33","author":"DE Charrier","year":"2019","unstructured":"Charrier, D.E., et al.: Studies on the energy and deep memory behaviour of a cache-oblivious, task-based hyperbolic PDE solver. Int. J. High Perform. Comp. Appl. 33(5), 973\u2013986 (2019)","journal-title":"Int. J. High Perform. Comp. Appl."},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Charrier, D., Hazelwood, B., Weinzierl, T.: Enclave tasking for discontinuous Galerkin methods on dynamically adaptive meshes. SIAM J. Sci. Comput. 42(3), C69\u2013C96 (2020)","DOI":"10.1137\/19M1276194"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Z., et al: Fault tolerant high performance computing by a coding approach. In: Proceedings of 10th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 213\u2013223. ACM (2005)","DOI":"10.1145\/1065944.1065973"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Chung, J., et al.: Containment domains: a scalable, efficient, and flexible resilience scheme for exascale systems. In: SC 2012: Proceedings of the International Conference for HPC, Networking, Storage and Analysis, pp. 1\u201311 (2012)","DOI":"10.1109\/SC.2012.36"},{"key":"23_CR10","unstructured":"Day, S.M., et al.: Tests of 3D elastodynamics codes: final report for lifelines program task 1A02. Technical report (2003)"},{"key":"23_CR11","unstructured":"Dongarra, J., et al.: Applied mathematics research for exascale computing. Technical report, Lawrence Livermore National Lab (2014)"},{"key":"23_CR12","unstructured":"Engelmann, C., Ong, H.H., Scott, S.L.: The case for modular redundancy in large-scale high performance computing systems. In: Proceedings of 8th IASTED International Conference on Parallel and Distributed Computing and Networks, vol. 1, pp. 189\u2013194 (2009)"},{"key":"23_CR13","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1016\/j.future.2013.04.014","volume":"30","author":"C Engelmann","year":"2014","unstructured":"Engelmann, C.: Scaling to a million cores and beyond: using light-weight simulation to understand the challenges ahead on the road to exascale. Future Gener. Comput. Syst. 30, 59\u201365 (2014)","journal-title":"Future Gener. Comput. Syst."},{"issue":"4","key":"23_CR14","doi-asserted-by":"publisher","first-page":"465","DOI":"10.1177\/1094342005056137","volume":"19","author":"GE Fagg","year":"2005","unstructured":"Fagg, G.E., et al.: Process fault tolerance: semantics, design and applications for high performance computing. Int. J. High Perform. Comput. Appl. 19(4), 465\u2013477 (2005)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Ferreira, K., et al.: Evaluating the viability of process replication reliability for exascale systems. In: 2011 International Conference for HPC, Networking, Storage and Analysis (SC), pp. 1\u201312 (2011)","DOI":"10.1145\/2063384.2063443"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Fiala, D., Mueller, F., Engelmann, C., Ferreira, K., Brightwell, R., Riesen, R.: Detection and correction of silent data corruption for large-scale high-performance computing. In: Proceedings of 25th IEEE\/ACM International Conference on HPC, Networking, Storage and Analysis, pp. 78:1\u201378:12. ACM, November 2012","DOI":"10.2172\/1081941"},{"issue":"C","key":"23_CR17","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1016\/j.parco.2015.07.003","volume":"49","author":"D G\u00f6ddeke","year":"2015","unstructured":"G\u00f6ddeke, D., Altenbernd, M., Ribbrock, D.: Fault-tolerant finite-element multigrid algorithms with hierarchically compressed asynchronous checkpointing. Parallel Comput. 49(C), 117\u2013135 (2015)","journal-title":"Parallel Comput."},{"key":"23_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"635","DOI":"10.1007\/978-3-319-58943-5_51","volume-title":"Euro-Par 2016: Parallel Processing Workshops","author":"M Heene","year":"2017","unstructured":"Heene, M., Hinojosa, A.P., Bungartz, H.-J., Pfl\u00fcger, D.: A massively-parallel, fault-tolerant solver for high-dimensional PDEs. In: Desprez, F., et al. (eds.) Euro-Par 2016. LNCS, vol. 10104, pp. 635\u2013647. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-58943-5_51"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Hoefler, T., Lumsdaine, A.: Message progression in parallel computing - to thread or not to thread? In: IEEE International Conference on Cluster Computing, pp. 213\u2013222 (2008)","DOI":"10.1109\/CLUSTR.2008.4663774"},{"key":"23_CR20","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1016\/j.jpdc.2019.12.005","volume":"138","author":"J Klinkenberg","year":"2020","unstructured":"Klinkenberg, J., Samfass, P., Bader, M., Terboven, C., M\u00fcller, M.S.: Chameleon: reactive load balancing for hybrid MPI+OpenMP task-parallel applications. J. Parallel Distr. Comput. 138, 55\u201364 (2020)","journal-title":"J. Parallel Distr. Comput."},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Reinarz, A., et al.: ExaHyPE: an engine for parallel dynamically adaptive simulations of wave problems. Comput. Phys. Commun., 107251 (2020)","DOI":"10.1016\/j.cpc.2020.107251"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Reinarz, A., Gallard, J.M., Bader, M.: Influence of a-posteriori subcell limiting on fault frequency in higher-order DG schemes. In: IEEE\/ACM 8th Workshop on Fault Tolerance for HPC at eXtreme Scale, FTXS@SC 2018, pp. 79\u201386 (2018)","DOI":"10.1109\/FTXS.2018.00012"},{"key":"23_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"271","DOI":"10.1007\/978-3-030-20656-7_14","volume-title":"High Performance Computing","author":"A Rezaei","year":"2019","unstructured":"Rezaei, A., Khetawat, H., Patil, O., Mueller, F., Hargrove, P., Roman, E.: End-to-end resilience for HPC applications. In: Weiland, M., Juckeland, G., Trinitis, C., Sadayappan, P. (eds.) ISC High Performance 2019. LNCS, vol. 11501, pp. 271\u2013290. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-20656-7_14"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Riesen, R., Ferreira, K., Stearley, J.: See applications run and throughput jump: the case for redundant computing in HPC. In: Proceedings of International Conference on Dependable Systems and Networks, pp. 29\u201334 (2010)","DOI":"10.1109\/DSNW.2010.5542625"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Samfass, P., Klinkenberg, J., Bader, M.: Hybrid MPI+OpenMP reactive work stealing in distributed memory in the PDE framework sam(oa)$$^2$$. In: IEEE International Conference on Cluster Computing, pp. 337\u2013347, September 2018","DOI":"10.1109\/CLUSTER.2018.00051"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Samfass, P., Weinzierl, T., Charrier, D.E., Bader, M.: Lightweight task offloading exploiting MPI wait times for parallel adaptive mesh refinement. In: Concurrency and Computation: Practice and Experience (2020, to appear)","DOI":"10.1002\/cpe.5916"},{"issue":"04","key":"23_CR27","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1109\/TDSC.2009.4","volume":"7","author":"B Schroeder","year":"2010","unstructured":"Schroeder, B., Gibson, G.A.: A large-scale study of failures in high-performance computing systems. IEEE Trans. Depend. Secur. Comput. 7(04), 337\u2013350 (2010)","journal-title":"IEEE Trans. Depend. Secur. Comput."},{"key":"23_CR28","unstructured":"Simon, T., Dorband, J.: Improving application resilience through probabilistic task replication. In: ACM Workshop on Algorithmic and Application Error Resilience, June 2013"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Subasi, O., Yalcin, G., Zyulkyarov, F., Unsal, O., Labarta, J.: Designing and modelling selective replication for fault-tolerant HPC applications. In: 17th IEEE\/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID), pp. 452\u2013457 (2017)","DOI":"10.1109\/CCGRID.2017.40"},{"key":"23_CR30","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1016\/j.jcp.2019.02.004","volume":"386","author":"M Tavelli","year":"2019","unstructured":"Tavelli, M., Dumbser, M., Charrier, D.E., Rannabauer, L., Weinzierl, T., Bader, M.: A simple diffuse interface approach on adaptive Cartesian grids for the linear elastic wave equations with complex topography. J. Comput. Phys. 386, 158\u2013189 (2019)","journal-title":"J. Comput. Phys."}],"container-title":["Lecture Notes in Computer Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-50743-5_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,18]],"date-time":"2023-12-18T20:04:40Z","timestamp":1702929880000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-50743-5_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030507428","9783030507435"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-50743-5_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"15 June 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISC High Performance","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on High Performance Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Frankfurt am Main","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 June 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 June 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"35","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"supercomputing2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.isc-hpc.com\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Linklings","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"87","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"31% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.73","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.33","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}