{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,21]],"date-time":"2025-05-21T05:28:31Z","timestamp":1747805311271,"version":"3.38.0"},"publisher-location":"Berlin, Heidelberg","reference-count":21,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642233999"},{"type":"electronic","value":"9783642234002"}],"license":[{"start":{"date-parts":[[2011,1,1]],"date-time":"2011-01-01T00:00:00Z","timestamp":1293840000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2011]]},"DOI":"10.1007\/978-3-642-23400-2_53","type":"book-chapter","created":{"date-parts":[[2011,8,17]],"date-time":"2011-08-17T11:34:15Z","timestamp":1313580855000},"page":"567-578","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["On the Use of Cluster-Based Partial Message Logging to Improve Fault Tolerance for MPI HPC Applications"],"prefix":"10.1007","author":[{"given":"Thomas","family":"Ropars","sequence":"first","affiliation":[]},{"given":"Amina","family":"Guermouche","sequence":"additional","affiliation":[]},{"given":"Bora","family":"U\u00e7ar","sequence":"additional","affiliation":[]},{"given":"Esteban","family":"Meneses","sequence":"additional","affiliation":[]},{"given":"Laxmikant V.","family":"Kal\u00e9","sequence":"additional","affiliation":[]},{"given":"Franck","family":"Cappello","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"53_CR1","doi-asserted-by":"crossref","unstructured":"Antypas, K., Shalf, J., Wasserman, H.: NERSC-6 Workload Analysis and Benchmark Selection Process. Technical Report LBNL-1014E, Lawrence Berkeley National Laboratory, Berkeley (2008)","DOI":"10.2172\/938789"},{"key":"53_CR2","unstructured":"Asanovic, K., Bodik, R., Catanzaro, B.C., Gebis, J.J., Husbands, P., Keutzer, K., Patterson, D.A., Plishker, W.L., Shalf, J., Williams, S.W., Yelick, K.A.: The Landscape of Parallel Computing Research: A View from Berkeley. Technical Report UCB\/EECS-2006-183, University of California, Berkeley (2006)"},{"key":"53_CR3","unstructured":"Bailey, D., Harris, T., Saphir, W., van der Wilngaart, R., Woo, A., Yarrow, M.: The NAS Parallel Benchmarks 2.0. Technical Report NAS-95-020, NASA Ames Research Center (1995)"},{"key":"53_CR4","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1016\/0020-0190(92)90140-Q","volume":"42","author":"T.N. Bui","year":"1992","unstructured":"Bui, T.N., Jones, C.: Finding good approximate vertex and edge partitions is NP-hard. Information Processing Letters\u00a042, 153\u2013159 (1992)","journal-title":"Information Processing Letters"},{"key":"53_CR5","doi-asserted-by":"publisher","first-page":"212","DOI":"10.1177\/1094342009106189","volume":"23","author":"F. Cappello","year":"2009","unstructured":"Cappello, F.: Fault tolerance in petascale\/exascale systems: Current knowledge, challenges and research opportunities. International Journal of High Performance Computing Applications\u00a023, 212\u2013226 (2009)","journal-title":"International Journal of High Performance Computing Applications"},{"key":"53_CR6","unstructured":"\u00c7ataly\u00fcrek, \u00dc.V., Aykanat, C.: PaToH: A multilevel hypergraph partitioning tool, version 3.0. Technical Report BU-CE-9915, Bilkent Univ.(1999)"},{"key":"53_CR7","doi-asserted-by":"publisher","first-page":"549","DOI":"10.1145\/3828.3829","volume":"32","author":"W.H. Cunningham","year":"1985","unstructured":"Cunningham, W.H.: Optimal attack and reinforcement of a network. J. ACM\u00a032, 549\u2013561 (1985)","journal-title":"J. ACM"},{"key":"53_CR8","first-page":"3","volume-title":"Proceedings of the 2003 International Conference on Computational Science, ICCS 2003","author":"J. Daly","year":"2003","unstructured":"Daly, J.: A model for predicting the optimum checkpoint interval for restart dumps. In: Proceedings of the 2003 International Conference on Computational Science, ICCS 2003, pp. 3\u201312. Springer, Heidelberg (2003)"},{"issue":"3","key":"53_CR9","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1145\/568522.568525","volume":"34","author":"E.N.(M.) Elnozahy","year":"2002","unstructured":"Elnozahy, E.N(M.), Alvisi, L., Wang, Y.-M., Johnson, D.B.: A Survey of Rollback-Recovery Protocols in Message-Passing Systems. ACM Computing Surveys\u00a034(3), 375\u2013408 (2002)","journal-title":"ACM Computing Surveys"},{"key":"53_CR10","volume-title":"Computers and Intractability; A Guide to the Theory of NP-Completeness","author":"M.R. Garey","year":"1979","unstructured":"Garey, M.R., Johnson, D.S.: Computers and Intractability; A Guide to the Theory of NP-Completeness. W. H. Freeman & Co., New York (1979)"},{"key":"53_CR11","doi-asserted-by":"crossref","unstructured":"Guermouche, A., Ropars, T., Brunet, E., Snir, M., Cappello, F.: Uncoordinated Checkpointing Without Domino Effect for Send-Deterministic Message Passing Applications. In: 25th IEEE International Parallel & Distributed Processing Symposium (IPDPS 2011), Anchorage, USA (2011)","DOI":"10.1109\/IPDPS.2011.95"},{"key":"53_CR12","doi-asserted-by":"crossref","unstructured":"Ho, J.C.Y., Wang, C.-L., Lau, F.C.M.: Scalable Group-Based Checkpoint\/Restart for Large-Scale Message-Passing Systems. In: 22nd IEEE International Parallel and Distributed Processing Symposium, Miami, USA (2008)","DOI":"10.1109\/IPDPS.2008.4536302"},{"key":"53_CR13","doi-asserted-by":"crossref","unstructured":"Kamil, S., Shalf, J., Oliker, L., Skinner, D.: Understanding ultra-scale application communication requirements. In: Proceedings of the 2005 IEEE International Symposium on Workload Characterization, pp. 178\u2013187 (2005)","DOI":"10.1109\/IISWC.2005.1526015"},{"key":"53_CR14","unstructured":"Karypis, G., Kumar, V.: MeTiS: A Software Package for Partitioning Unstructured Graphs, Partitioning Meshes, and Computing Fill-Reducing Orderings of Sparse Matrices Version 4.0. Univ. Minnesota, Minneapolis (1998)"},{"key":"53_CR15","doi-asserted-by":"crossref","unstructured":"Meneses, E., Mendes, C.L., Kale, L.V.: Team-based Message Logging: Preliminary Results. In: 3rd Workshop on Resiliency in High Performance Computing (Resilience) in Clusters, Clouds, and Grids (CCGRID 2010) (May 2010)","DOI":"10.1109\/CCGRID.2010.110"},{"key":"53_CR16","doi-asserted-by":"publisher","first-page":"773","DOI":"10.1109\/CCGrid.2004.1336712","volume-title":"Proceedings of the 2004 IEEE International Symposium on Cluster Computing and the Grid (CCGRID 2004)","author":"S. Monnet","year":"2004","unstructured":"Monnet, S., Morin, C., Badrinath, R.: Hybrid Checkpointing for Parallel Applications in Cluster Federations. In: Proceedings of the 2004 IEEE International Symposium on Cluster Computing and the Grid (CCGRID 2004), pp. 773\u2013782. IEEE Computer Society, Washington, DC, USA (2004)"},{"key":"53_CR17","unstructured":"Pellegrini, F.: SCOTCH 5.1 User\u2019s Guide. LaBRI (2008)"},{"key":"53_CR18","volume-title":"Workshop on Communication Architecture for Clusters CAC 2006","author":"R. Riesen","year":"2006","unstructured":"Riesen, R.: Communication Patterns. In: Workshop on Communication Architecture for Clusters CAC 2006, Rhodes Island, Greece, IEEE, Los Alamitos (2006)"},{"issue":"1","key":"53_CR19","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1177\/1094342005051521","volume":"19","author":"R. Thakur","year":"2005","unstructured":"Thakur, R., Rabenseifner, R., Gropp, W.: Optimization of Collective Communication Operations in MPICH. International Journal of High Performance Computing Applications\u00a019(1), 49\u201366 (2005)","journal-title":"International Journal of High Performance Computing Applications"},{"key":"53_CR20","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1016\/S0743-7315(03)00104-7","volume":"63","author":"J.S. Vetter","year":"2003","unstructured":"Vetter, J.S., Mueller, F.: Communication Characteristics of Large-Scale Scientific Applications for Contemporary Cluster Architectures. Journal of Parallel and Distributed Computing\u00a063, 853\u2013865 (2003)","journal-title":"Journal of Parallel and Distributed Computing"},{"key":"53_CR21","doi-asserted-by":"publisher","first-page":"819","DOI":"10.1002\/cpe.1364","volume":"21","author":"J.-M. Yang","year":"2009","unstructured":"Yang, J.-M., Li, K.F., Li, W.-W., Zhang, D.-F.: Trading Off Logging Overhead and Coordinating Overhead to Achieve Efficient Rollback Recovery. Concurrency and Computation: Practice and Experience\u00a021, 819\u2013853 (2009)","journal-title":"Concurrency and Computation : Practice and Experience"}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2011 Parallel Processing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-23400-2_53","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,8]],"date-time":"2025-03-08T21:00:48Z","timestamp":1741467648000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-23400-2_53"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011]]},"ISBN":["9783642233999","9783642234002"],"references-count":21,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-23400-2_53","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2011]]},"assertion":[{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}