{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,14]],"date-time":"2025-05-14T00:40:03Z","timestamp":1747183203862,"version":"3.40.5"},"publisher-location":"Cham","reference-count":25,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319143248"},{"type":"electronic","value":"9783319143255"}],"license":[{"start":{"date-parts":[[2014,1,1]],"date-time":"2014-01-01T00:00:00Z","timestamp":1388534400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2014]]},"DOI":"10.1007\/978-3-319-14325-5_46","type":"book-chapter","created":{"date-parts":[[2014,12,10]],"date-time":"2014-12-10T08:55:04Z","timestamp":1418201704000},"page":"535-546","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["The External Recovery Problem"],"prefix":"10.1007","author":[{"given":"Arkadiusz","family":"Danilecki","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mateusz","family":"Ho\u0142enko","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Anna","family":"Kobusi\u0144ska","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Piotr","family":"Zierhoffer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","reference":[{"issue":"2","key":"46_CR1","doi-asserted-by":"publisher","first-page":"149","DOI":"10.1109\/32.666828","volume":"24","author":"L. Alvisi","year":"1998","unstructured":"Alvisi, L., Marzullo, K.: Message logging: Pessimistic, optimistic, causal, and optimal. Software Engineering\u00a024(2), 149\u2013159 (1998)","journal-title":"Software Engineering"},{"issue":"3","key":"46_CR2","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1145\/1013202.1013205","volume":"4","author":"R.S. Barga","year":"2004","unstructured":"Barga, R.S., Lomet, D.B., Shegalov, G., Weikum, G.: Recovery guarantees for internet applications. ACM Trans. Internet Techn.\u00a04(3), 289\u2013328 (2004)","journal-title":"ACM Trans. Internet Techn."},{"key":"46_CR3","doi-asserted-by":"crossref","unstructured":"Bouteiller, A., Cappello, F., H\u00e9rault, T., Krawezik, G., Lemarinier, P., Magniette, F.: MPICH-V2: a fault tolerant MPI for volatile nodes based on pessimistic sender based message logging. In: SC, p. 25. ACM (2003)","DOI":"10.1145\/1048935.1050176"},{"issue":"4","key":"46_CR4","doi-asserted-by":"publisher","first-page":"572","DOI":"10.1002\/cpe.2859","volume":"25","author":"A. Bouteiller","year":"2013","unstructured":"Bouteiller, A., H\u00e9rault, T., Bosilca, G., Dongarra, J.J.: Correlated set coordination in fault tolerant message logging protocols for many-core clusters. Concurrency and Computation: Practice and Experience\u00a025(4), 572\u2013585 (2013)","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"46_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1007\/978-3-642-33074-2_6","volume-title":"Advances in Databases and Information Systems","author":"J. Brzezi\u0144ski","year":"2012","unstructured":"Brzezi\u0144ski, J., Danilecki, A., Ho\u0142enko, M., Kobusi\u0144ska, A., Kobusi\u0144ski, J., Zierhoffer, P.: D-reServE: Distributed reliable service environment. In: Morzy, T., H\u00e4rder, T., Wrembel, R. (eds.) ADBIS 2012. LNCS, vol.\u00a07503, pp. 71\u201384. Springer, Heidelberg (2012)"},{"key":"46_CR6","doi-asserted-by":"crossref","unstructured":"Cappello, F., Guermouche, A., Snir, M.: On communication determinism in parallel HPC applications. In: 2010 Proceedings of 19th International Conference on Computer Communications and Networks (ICCCN), pp. 1\u20138 (2010)","DOI":"10.1109\/ICCCN.2010.5560143"},{"issue":"3","key":"46_CR7","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1145\/568522.568525","volume":"34","author":"E.N. Elnozahy","year":"2002","unstructured":"Elnozahy, E.N., Alvisi, L., Wang, Y.-M., Johnson, D.B.: A survey of rollback-recovery protocols in message-passing systems. ACM Comput. Surv.\u00a034(3), 375\u2013408 (2002)","journal-title":"ACM Comput. Surv."},{"key":"46_CR8","doi-asserted-by":"crossref","unstructured":"Guermouche, A., Ropars, T., Brunet, E., Snir, M., Cappello, F.: Uncoordinated checkpointing without domino effect for send-deterministic message passing applications. In: Accepted to the 25th IEEE International Parallel and Distributed Processing Symposium, IPDPS (May 2011)","DOI":"10.1109\/IPDPS.2011.95"},{"key":"46_CR9","doi-asserted-by":"crossref","unstructured":"Guermouche, A., Ropars, T., Snir, M., Cappello, F.: HydEE: Failure containment without event logging for large scale send-deterministic mpi applications. In: 2012 IEEE 26th International Parallel Distributed Processing Symposium (IPDPS), pp. 1216\u20131227 (2012)","DOI":"10.1109\/IPDPS.2012.111"},{"key":"46_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/978-3-540-68083-3_4","volume-title":"Advances in Grid and Pervasive Computing","author":"B. Gupta","year":"2008","unstructured":"Gupta, B., Rahimi, S., Allam, V., Jupally, V.: Domino-effect free crash recovery for concurrent failures in cluster federation. In: Wu, S., Yang, L.T., Xu, T.L. (eds.) GPC 2008. LNCS, vol.\u00a05036, pp. 4\u201317. Springer, Heidelberg (2008)"},{"key":"46_CR11","doi-asserted-by":"publisher","first-page":"462","DOI":"10.1016\/0196-6774(90)90022-7","volume":"11","author":"D. Johnson","year":"1990","unstructured":"Johnson, D., Zwaenepoel, W.: Recovery in distributed systems using optimistic message logging and checkpointing. J Algorithms\u00a011, 462\u2013491 (1990)","journal-title":"J Algorithms"},{"issue":"7","key":"46_CR12","doi-asserted-by":"publisher","first-page":"558","DOI":"10.1145\/359545.359563","volume":"21","author":"L. Lamport","year":"1978","unstructured":"Lamport, L.: Time, clocks, and the ordering of events in a distributed system. Communications of the ACM\u00a021(7), 558\u2013565 (1978)","journal-title":"Communications of the ACM"},{"key":"46_CR13","doi-asserted-by":"crossref","unstructured":"Lemarinier, P., Bouteiller, A., Herault, T., Krawezik, G., Cappello, F.: Improved message logging versus improved coordinated checkpointing for fault tolerant MPI. In: CLUSTER 2004: Proceedings of the 2004 IEEE International Conference on Cluster Computing, Washington, DC, USA, pp. 115\u2013124 (2004)","DOI":"10.1109\/CLUSTR.2003.1253321"},{"issue":"8","key":"46_CR14","doi-asserted-by":"publisher","first-page":"1217","DOI":"10.1016\/j.future.2012.03.012","volume":"28","author":"Y. Luo","year":"2012","unstructured":"Luo, Y., Manivannan, D.: Hope: A hybrid optimistic checkpointing and selective pessimistic message logging protocol for large scale distributed systems. Future Generation Comp. Syst.\u00a028(8), 1217\u20131235 (2012)","journal-title":"Future Generation Comp. Syst."},{"issue":"12","key":"46_CR15","doi-asserted-by":"publisher","first-page":"1632","DOI":"10.1002\/cpe.1413","volume":"21","author":"A. Maloney","year":"2009","unstructured":"Maloney, A., Goscinski, A.: A survey and review of the current state of rollback-recovery for cluster systems. Concurrency and Computation: Practice and Experience\u00a021(12), 1632\u20131666 (2009)","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"46_CR16","unstructured":"Monnet, S., Morin, C., Badrinath, R.: A hierarchical checkpointing protocol for parallel applications in cluster federations. In: IPDPS (2004)"},{"issue":"2","key":"46_CR17","doi-asserted-by":"publisher","first-page":"165","DOI":"10.1109\/71.342127","volume":"6","author":"R.H.B. Netzer","year":"1995","unstructured":"Netzer, R.H.B., Xu, J.: Necessary and sufficient conditions for consistent global snapshots. IEEE Transactions on Parallel and Distributed Systems\u00a06(2), 165\u2013169 (1995)","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"issue":"11","key":"46_CR18","doi-asserted-by":"publisher","first-page":"1549","DOI":"10.1016\/S0167-8191(02)00165-5","volume":"28","author":"T. Park","year":"2002","unstructured":"Park, T., Lee, I., Yeom, H.Y.: An efficient causal logging scheme for recoverable distributed shared memory systems. Parallel Computing\u00a028(11), 1549\u20131572 (2002)","journal-title":"Parallel Computing"},{"issue":"2","key":"46_CR19","first-page":"221","volume":"1","author":"B. Randell","year":"1975","unstructured":"Randell, B.: System structure for software fault tolerance. IEEE Transactions on Software Engineering\u00a01(2), 221\u2013232 (1975)","journal-title":"IEEE Transactions on Software Engineering"},{"key":"46_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"567","DOI":"10.1007\/978-3-642-23400-2_53","volume-title":"Euro-Par 2011 Parallel Processing","author":"T. Ropars","year":"2011","unstructured":"Ropars, T., Guermouche, A., U\u00e7ar, B., Meneses, E., Kal\u00e9, L.V., Cappello, F.: On the use of cluster-based partial message logging to improve fault tolerance for MPI HPC applications. In: Jeannot, E., Namyst, R., Roman, J. (eds.) Euro-Par 2011, Part I. LNCS, vol.\u00a06852, pp. 567\u2013578. Springer, Heidelberg (2011)"},{"key":"46_CR21","doi-asserted-by":"crossref","unstructured":"Ropars, T., Martsinkevich, T.V., Guermouche, A., Schiper, A., Cappello, F.: Spbc: Leveraging the characteristics of mpi hpc applications for scalable checkpointing. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, SC 2013, pp. 8:1\u20138:12. ACM, New York (2013)","DOI":"10.1145\/2503210.2503271"},{"issue":"2","key":"46_CR22","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1109\/TSE.1980.230469","volume":"6","author":"D.L. Russell","year":"1980","unstructured":"Russell, D.L.: State restoration in systems of communicating processes. IEEE Trans. Software Eng.\u00a06(2), 183\u2013194 (1980)","journal-title":"IEEE Trans. Software Eng."},{"issue":"3","key":"46_CR23","doi-asserted-by":"publisher","first-page":"204","DOI":"10.1145\/3959.3962","volume":"3","author":"R. Storm","year":"1985","unstructured":"Storm, R., Yemini, S.: Optimistic recovery in distributed systems. ACM Trans. Comput. Syst.\u00a03(3), 204\u2013226 (1985)","journal-title":"ACM Trans. Comput. Syst."},{"key":"46_CR24","doi-asserted-by":"crossref","unstructured":"Tarafdar, A., Garg, V.K.: Addressing false causality while detecting predicates in distributed programs. In: Proceedings of the 18th IEEE International Conference on Distributed Computing Systems (ICDCS 1998), pp. 94\u2013101 (1998)","DOI":"10.1109\/ICDCS.1998.679491"},{"issue":"4","key":"46_CR25","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1109\/TDSC.2005.42","volume":"2","author":"J. Tsai","year":"2005","unstructured":"Tsai, J.: An efficient index-based checkpointing protocol with constant-size control information on messages. IEEE Trans. Dependable Sec. Comput.\u00a02(4), 287\u2013296 (2005)","journal-title":"IEEE Trans. Dependable Sec. Comput."}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2014: Parallel Processing Workshops"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-14325-5_46","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,14]],"date-time":"2025-05-14T00:19:06Z","timestamp":1747181946000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-14325-5_46"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014]]},"ISBN":["9783319143248","9783319143255"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-14325-5_46","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2014]]},"assertion":[{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}