{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T20:32:19Z","timestamp":1743107539585,"version":"3.40.3"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030105488"},{"type":"electronic","value":"9783030105495"}],"license":[{"start":{"date-parts":[[2018,12,31]],"date-time":"2018-12-31T00:00:00Z","timestamp":1546214400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2018,12,31]],"date-time":"2018-12-31T00:00:00Z","timestamp":1546214400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-10549-5_61","type":"book-chapter","created":{"date-parts":[[2018,12,31]],"date-time":"2018-12-31T00:03:31Z","timestamp":1546214611000},"page":"787-799","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Do Moldable Applications Perform Better on Failure-Prone HPC Platforms?"],"prefix":"10.1007","author":[{"given":"Valentin","family":"Le F\u00e8vre","sequence":"first","affiliation":[]},{"given":"George","family":"Bosilca","sequence":"additional","affiliation":[]},{"given":"Aurelien","family":"Bouteiller","sequence":"additional","affiliation":[]},{"given":"Thomas","family":"Herault","sequence":"additional","affiliation":[]},{"given":"Atsushi","family":"Hori","sequence":"additional","affiliation":[]},{"given":"Yves","family":"Robert","sequence":"additional","affiliation":[]},{"given":"Jack","family":"Dongarra","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,12,31]]},"reference":[{"key":"61_CR1","unstructured":"Amdahl, G.: The validity of the single processor approach to achieving large scale computing capabilities. In: AFIPS Conference Proceedings, vol. 30, pp. 483\u2013485. AFIPS Press (1967)"},{"key":"61_CR2","doi-asserted-by":"crossref","unstructured":"Ashraf, R.A., Hukerikar, S., Engelmann, C.: Shrink or substitute: handling process failures in HPC systems using in-situ recovery. CoRR abs\/1801.04523 (2018). http:\/\/arxiv.org\/abs\/1801.04523","DOI":"10.1109\/PDP2018.2018.00032"},{"issue":"3","key":"61_CR3","doi-asserted-by":"publisher","first-page":"244","DOI":"10.1177\/1094342013488238","volume":"27","author":"Wesley Bland","year":"2013","unstructured":"Bland, W., Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J.: Post-failure recovery of MPI communication capability: design and rationale. Int. J. High Perform. Comput. Appl. 27(3), 244\u2013254 (2013). https:\/\/doi.org\/10.1177\/1094342013488238, http:\/\/hpc.sagepub.com\/content\/27\/3\/244.abstract","journal-title":"The International Journal of High Performance Computing Applications"},{"issue":"1","key":"61_CR4","first-page":"5","volume":"1","author":"F Cappello","year":"2014","unstructured":"Cappello, F., Geist, A., Gropp, W., Kale, S., Kramer, B., Snir, M.: Toward exascale resilience: 2014 update. Supercomput. Front. Innov. 1(1), 5\u201328 (2014)","journal-title":"Supercomput. Front. Innov."},{"key":"61_CR5","doi-asserted-by":"crossref","unstructured":"Cavelan, A., Li, J., Robert, Y., Sun, H.: When Amdahl meets Young\/Daly. In: Cluster 2016. IEEE Computer Society Press (2016)","DOI":"10.1109\/CLUSTER.2016.17"},{"issue":"10","key":"61_CR6","doi-asserted-by":"publisher","first-page":"1571","DOI":"10.1016\/S0743-7315(02)91869-1","volume":"62","author":"W Cirne","year":"2002","unstructured":"Cirne, W., Berman, F.: Using moldability to improve the performance of supercomputer jobs. J. Parallel Distrib. Comput. 62(10), 1571\u20131601 (2002)","journal-title":"J. Parallel Distrib. Comput."},{"key":"61_CR7","unstructured":"CORAL: Collaboration of Oak Ridge, Argonne and Livermore National Laboratorie: Draft CORAL-2 build statement of work. Technical report LLNL-TM-7390608, Lawrence Livermore National Laboratory, 30 March 2018"},{"issue":"3","key":"61_CR8","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/j.future.2004.11.016","volume":"22","author":"JT Daly","year":"2006","unstructured":"Daly, J.T.: A higher order estimate of the optimum checkpoint interval for restart dumps. Future Gener. Comp. Syst. 22(3), 303\u2013312 (2006)","journal-title":"Future Gener. Comp. Syst."},{"key":"61_CR9","doi-asserted-by":"crossref","unstructured":"Du, P., Bouteiller, A., et al.: Algorithm-based fault tolerance for dense matrix factorizations. In: PPoPP, pp. 225\u2013234. ACM (2012)","DOI":"10.1145\/2370036.2145845"},{"key":"61_CR10","unstructured":"Dutot, P., Mouni\u00e9, G., Trystram, D.: Scheduling parallel tasks approximation algorithms. In: Leung, J.Y. (ed.) Handbook of Scheduling - Algorithms, Models, and Performance Analysis. CRC Press (2004)"},{"key":"61_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"656","DOI":"10.1007\/978-3-319-27308-2_53","volume-title":"Parallel Processing Workshops","author":"A Fang","year":"2015","unstructured":"Fang, A., Fujita, H., Chien, A.A.: Towards understanding post-recovery efficiency for shrinking and non-shrinking recovery. In: Hunold, S., et al. (eds.) Euro-Par 2015. LNCS, vol. 9523, pp. 656\u2013668. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-27308-2_53"},{"key":"61_CR12","unstructured":"F\u00e8vre, V.L., et al.: Do moldable applications perform better on failure-prone HPC platforms? Research report RR-9174, INRIA (2018)"},{"key":"61_CR13","unstructured":"Guo, Y., Bland, W., Balaji, P., Zhou, X.: Fault tolerant MapReduce-MPI for HPC clusters. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015, Austin, TX, USA, 15\u201320 November 2015, pp. 34:1\u201334:12 (2015)"},{"key":"61_CR14","doi-asserted-by":"crossref","unstructured":"Gupta, S., Patel, T., Engelmann, C., Tiwari, D.: Failures in large scale systems: long-term measurement, analysis, and implications. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2017, New York, NY, USA, pp. 44:1\u201344:12 (2017)","DOI":"10.1145\/3126908.3126937"},{"key":"61_CR15","doi-asserted-by":"publisher","unstructured":"Hori, A., Yoshinaga, K., Herault, T., Bouteiller, A., Bosilca, G., Ishikawa, Y.: Sliding substitution of failed nodes. In: Proceedings of the 22nd European MPI Users\u2019 Group Meeting, EuroMPI 2015, pp. 14:1\u201314:10. ACM, New York (2015). https:\/\/doi.org\/10.1145\/2802658.2802670","DOI":"10.1145\/2802658.2802670"},{"issue":"6","key":"61_CR16","doi-asserted-by":"publisher","first-page":"518","DOI":"10.1109\/TC.1984.1676475","volume":"33","author":"KH Huang","year":"1984","unstructured":"Huang, K.H., Abraham, J.A.: Algorithm-based fault tolerance for matrix operations. IEEE Trans. Comput. 33(6), 518\u2013528 (1984)","journal-title":"IEEE Trans. Comput."},{"key":"61_CR17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-20943-2","volume-title":"Fault-Tolerance Techniques for High-Performance Computing","year":"2015","unstructured":"H\u00e9rault, T., Robert, Y. (eds.): Fault-Tolerance Techniques for High-Performance Computing. Springer, Heidelberg (2015). https:\/\/doi.org\/10.1007\/978-3-319-20943-2"},{"key":"61_CR18","doi-asserted-by":"crossref","unstructured":"Jin, H., Chen, Y., Zhu, H., Sun, X.H.: Optimizing HPC fault-tolerant environment: an analytical approach. In: Proceedings of the ICPP 2010 (2010)","DOI":"10.1109\/ICPP.2010.80"},{"issue":"3","key":"61_CR19","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1147\/rd.413.0303","volume":"41","author":"JE Moreira","year":"1997","unstructured":"Moreira, J.E., Naik, V.K.: Dynamic resource management on distributed systems using reconfigurable applications. IBM J. Res. Dev. 41(3), 303\u2013330 (1997)","journal-title":"IBM J. Res. Dev."},{"key":"61_CR20","unstructured":"Prabhakaranw, S.: Dynamic resource management and job scheduling for high performance computing. Ph.D. thesis, Technische Universit\u00e4t Darmstadt (2016)"},{"key":"61_CR21","unstructured":"Simulation software: computing the yield (2018). https:\/\/github.com\/vlefevre\/continuability"},{"issue":"1","key":"61_CR22","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1016\/j.parco.2009.12.010","volume":"36","author":"R Sudarsan","year":"2010","unstructured":"Sudarsan, R., Ribbens, C.J.: Design and performance of a scheduling framework for resizable parallel applications. Parallel Comput. 36(1), 48\u201364 (2010)","journal-title":"Parallel Comput."},{"key":"61_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"175","DOI":"10.1007\/978-3-642-01970-8_18","volume-title":"Computational Science","author":"R Sudarsan","year":"2009","unstructured":"Sudarsan, R., Ribbens, C.J., Farkas, D.: Dynamic resizing of parallel scientific simulations: a case study using LAMMPS. In: Allen, G., Nabrzyski, J., Seidel, E., van Albada, G.D., Dongarra, J., Sloot, P.M.A. (eds.) ICCS 2009. LNCS, vol. 5544, pp. 175\u2013184. Springer, Heidelberg (2009). https:\/\/doi.org\/10.1007\/978-3-642-01970-8_18"},{"key":"61_CR24","doi-asserted-by":"publisher","first-page":"576","DOI":"10.1016\/j.procs.2014.05.052","volume":"29","author":"K Yamamoto","year":"2014","unstructured":"Yamamoto, K., et al.: The K computer operations: experiences and statistics. Procedia Comput. Sci. (ICCS) 29, 576\u2013585 (2014)","journal-title":"Procedia Comput. Sci. (ICCS)"},{"issue":"9","key":"61_CR25","doi-asserted-by":"publisher","first-page":"530","DOI":"10.1145\/361147.361115","volume":"17","author":"JW Young","year":"1974","unstructured":"Young, J.W.: A first order approximation to the optimum checkpoint interval. Commun. ACM 17(9), 530\u2013531 (1974)","journal-title":"Commun. ACM"},{"issue":"5","key":"61_CR26","doi-asserted-by":"publisher","first-page":"1402","DOI":"10.1109\/TC.2014.2317182","volume":"64","author":"Z Zheng","year":"2015","unstructured":"Zheng, Z., Yu, L., Lan, Z.: Reliability-aware speedup models for parallel applications with coordinated checkpointing\/restart. IEEE Trans. Comput. 64(5), 1402\u20131415 (2015)","journal-title":"IEEE Trans. Comput."}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2018: Parallel Processing Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-10549-5_61","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T08:44:49Z","timestamp":1672562689000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-10549-5_61"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,12,31]]},"ISBN":["9783030105488","9783030105495"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-10549-5_61","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2018,12,31]]},"assertion":[{"value":"31 December 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Euro-Par","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Parallel Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Turin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 August 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 August 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"europar2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/europar2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}