{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,21]],"date-time":"2025-05-21T05:28:49Z","timestamp":1747805329353,"version":"3.40.4"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319102139"},{"type":"electronic","value":"9783319102146"}],"license":[{"start":{"date-parts":[[2014,1,1]],"date-time":"2014-01-01T00:00:00Z","timestamp":1388534400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2014,1,1]],"date-time":"2014-01-01T00:00:00Z","timestamp":1388534400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2014]]},"DOI":"10.1007\/978-3-319-10214-6_5","type":"book-chapter","created":{"date-parts":[[2014,9,30]],"date-time":"2014-09-30T11:56:12Z","timestamp":1412078172000},"page":"91-114","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Using Simulation to Evaluate the Performance of Resilience Strategies at Scale"],"prefix":"10.1007","author":[{"given":"Scott","family":"Levy","sequence":"first","affiliation":[]},{"given":"Bryan","family":"Topp","sequence":"additional","affiliation":[]},{"given":"Kurt B.","family":"Ferreira","sequence":"additional","affiliation":[]},{"given":"Dorian","family":"Arnold","sequence":"additional","affiliation":[]},{"given":"Torsten","family":"Hoefler","sequence":"additional","affiliation":[]},{"given":"Patrick","family":"Widener","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2014,10,1]]},"reference":[{"key":"5_CR1","unstructured":"Bergman, K., et al.: Exascale computing study: Technology challenges in achieving exascale systems (September 2008), http:\/\/www.science.energy.gov\/ascr\/Research\/CS\/DARPAexascale-hardware(2008).pdf"},{"issue":"3","key":"5_CR2","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/j.future.2004.11.016","volume":"22","author":"JT Daly","year":"2006","unstructured":"Daly, J.T.: A higher order estimate of the optimum checkpoint interval for restart dumps. Future Gener. Comput. Syst. 22(3), 303\u2013312 (2006)","journal-title":"Future Gener. Comput. Syst."},{"key":"5_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"206","DOI":"10.1007\/978-3-642-14390-8_22","volume-title":"Parallel Processing and Applied Mathematics","author":"M-S Bouguerra","year":"2010","unstructured":"Bouguerra, M.-S., Gautier, T., Trystram, D., Vincent, J.-M.: A flexible checkpoint\/restart model in distributed systems. In: Wyrzykowski, R., Dongarra, J., Karczewski, K., Wasniewski, J. (eds.) PPAM 2009, Part I. LNCS, vol. 6067, pp. 206\u2013215. Springer, Heidelberg (2010)"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Guermouche, A., Ropars, T., Brunet, E., Snir, M., Cappello, F.: Uncoordinated checkpointing without domino effect for send-deterministic MPI applications. In: International Parallel Distributed Processing Symposium (IPDPS), pp. 989\u20131000 (May 2011)","DOI":"10.1109\/IPDPS.2011.95"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Alvisi, L., Elnozahy, E., Rao, S., Husain, S., de Mel, A.: An analysis of communication induced checkpointing. In: Twenty-Ninth Annual International Symposium on Fault-Tolerant Computing, Digest of Papers, pp. 242\u2013249 (1999)","DOI":"10.1109\/FTCS.1999.781058"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Monnet, S., Morin, C., Badrinath, R.: A hierarchical checkpointing protocol for parallel applications in cluster federations. In: Proceedings of the 18th International Parallel and Distributed Processing Symposium, p. 211. IEEE (2004)","DOI":"10.1109\/IPDPS.2004.1303242"},{"issue":"3","key":"5_CR7","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1145\/568522.568525","volume":"34","author":"ENM Elnozahy","year":"2002","unstructured":"Elnozahy, E.N.M., Alvisi, L., Wang, Y.-M., Johnson, D.B.: A survey of rollback-recovery protocols in message-passing systems. ACM Comput. Surv. 34(3), 375\u2013408 (2002)","journal-title":"ACM Comput. Surv."},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Oldfield, R.A., Arunagiri, S., Teller, P.J., Seelam, S., Varela, M.R., Riesen, R., Roth, P.C.: Modeling the impact of checkpoints on next-generation systems. In: 24th IEEE Conference on Mass Storage Systems and Technologies, pp. 30\u201346 (September 2007)","DOI":"10.1109\/MSST.2007.4367962"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Ferreira, K., Riesen, R., Bridges, P., Arnold, D., Stearley, J., Laros III, J.H., Oldfield, R., Pedretti, K., Brightwell, R.: Evaluating the viability of process replication reliability for exascale systems. In: Lathrop, S., Costa, J., Kramer, W. (eds.) SC. ACM (November 2011)","DOI":"10.1145\/2063384.2063443"},{"key":"5_CR10","unstructured":"Schroeder, B., Gibson, G.A.: A large-scale study of failures in high-performance computing systems. In: International Conference on Dependable Systems and Networks (DSN) (June 2006)"},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Kannan, S., Gavrilovska, A., Schwan, K., Milojicic, D.: Optimizing checkpoints using NVM as virtual memory. In: Proceedings of the International Parallel and Distributed Processing Symposium, IPDPS 2013. ACM, New York (2013)","DOI":"10.1109\/IPDPS.2013.69"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Dong, X., Muralimanohar, N., Jouppi, N., Kaufmann, R., Xie, Y.: Leveraging 3D PCRAM technologies to reduce checkpoint overhead for future exascale systems. In: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis, SC 2009, pp. 57:1\u201357:12. ACM, New York (2009)","DOI":"10.1145\/1654059.1654117"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Bronevetsky, G., Marques, D., Pingali, K., McKee, S., Rugina, R.: Compiler-enhanced incremental checkpointing for openmp applications. In: IEEE International Symposium on Parallel & Distributed Processing, pp. 1\u201312 (2009)","DOI":"10.1109\/IPDPS.2009.5160999"},{"key":"5_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"272","DOI":"10.1007\/978-3-642-24449-0_31","volume-title":"Recent Advances in the Message Passing Interface","author":"Kurt B Ferreira","year":"2011","unstructured":"Ferreira, Kurt B., Riesen, Rolf, Brighwell, Ron, Bridges, Patrick, Arnold, Dorian: libhashckpt: hash-based incremental checkpointing using GPU\u2019s. In: Cotronis, Yiannis, Danalis, Anthony, Nikolopoulos, Dimitrios S., Dongarra, Jack (eds.) EuroMPI 2011. LNCS, vol. 6960, pp. 272\u2013281. Springer, Heidelberg (2011)"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Moody, A., Bronevetsky, G., Mohror, K., de Supinski, B.R.: Design, modeling, and evaluation of a scalable multi-level checkpointing system. In: ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis (SC 2010), pp. 1\u201311 (2010), http:\/\/dx.doi.org\/10.1109\/SC.2010.18","DOI":"10.1109\/SC.2010.18"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Ibtesham, D., Arnold, D., Bridges, P.G., Ferreira, K.B., Brightwell, R.: On the viability of compression for reducing the overheads of checkpoint\/restart-based fault tolerance. In: 2012 41st International Conference on Parallel Processing, pp. 148\u2013157 (2012)","DOI":"10.1109\/ICPP.2012.45"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Guermouche, A., Ropars, T., Snir, M., Cappello, F.: HydEE: Failure containment without event logging for large scale send-deterministic mpi applications. In: IPDPS, pp. 1216\u20131227. IEEE Computer Society (2012)","DOI":"10.1109\/IPDPS.2012.111"},{"key":"5_CR18","doi-asserted-by":"crossref","unstructured":"Mubarak, M., Carothers, C.D., Ross, R., Carns, P.: Modeling a million-node dragonfly network using massively parallel discrete-event simulation. In: 2012 SC Companion: High Performance Computing, Networking, Storage and Analysis (SCC), pp. 366\u2013376. IEEE (2012)","DOI":"10.1109\/SC.Companion.2012.56"},{"issue":"2\u20133","key":"5_CR19","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1007\/s10766-005-3582-6","volume":"33","author":"G Zheng","year":"2005","unstructured":"Zheng, G., Wilmarth, T., Jagadishprasad, P., Kal\u00e9, L.V.: Simulation-based performance prediction for large parallel machines. International Journal of Parallel Programming 33(2\u20133), 183\u2013207 (2005)","journal-title":"International Journal of Parallel Programming"},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Hoefler, T., Schneider, T., Lumsdaine, A.: LogGOPSim - Simulating Large-Scale Applications in the LogGOPS Model. In: Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing, pp. 597\u2013604. ACM (June 2010)","DOI":"10.1145\/1851476.1851564"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Ferreira, K.B., Bridges, P., Brightwell, R.: Characterizing application sensitivity to os interference using kernel-level noise injection. In: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing, p. 19. IEEE Press (2008)","DOI":"10.1109\/SC.2008.5219920"},{"key":"5_CR22","doi-asserted-by":"crossref","unstructured":"Hoefler, T., Schneider, T., Lumsdaine, A.: Characterizing the Influence of System Noise on Large-Scale Applications by Simulation. In: International Conference for High Performance Computing, Networking, Storage and Analysis (SC 2010) (November 2010)","DOI":"10.1109\/SC.2010.12"},{"key":"5_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-642-38718-0_1","volume-title":"High Performance Computing for Computational Science - VECPAR 2012","author":"Horst D Simon","year":"2013","unstructured":"Simon, Horst D.: Barriers to exascale computing. In: Dayd\u00e9, Michel, Marques, Osni, Nakajima, Kengo (eds.) VECPAR. LNCS, vol. 7851, pp. 1\u20133. Springer, Heidelberg (2013)"},{"issue":"3","key":"5_CR24","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1145\/568522.568525","volume":"34","author":"EN Elnozahy","year":"2002","unstructured":"Elnozahy, E.N., Alvisi, L., Wang, Y.-M., Johnson, D.B.: A survey of rollback-recovery protocols in message-passing systems. ACM Computing Surveys 34(3), 375\u2013408 (2002)","journal-title":"ACM Computing Surveys"},{"issue":"10","key":"5_CR25","doi-asserted-by":"publisher","first-page":"972","DOI":"10.1109\/71.730527","volume":"9","author":"JS Plank","year":"1998","unstructured":"Plank, J.S., Li, K., Puening, M.A.: Diskless checkpointing. IEEE Transactions on Parallel and Distributed Systems 9(10), 972\u2013986 (1998)","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Plank, J.S., Kim, Y.B., Dongarra, J.J.: Algorithm-based diskless checkpointing for fault tolerant matrix operations. In: Twenty-Fifth International Symposium on Fault-Tolerant Computing, Digest of Papers, Pasadena, CA, USA, pp. 351\u2013360. IEEE Comput. Soc. Press, Los Alamitos (1995)","DOI":"10.1109\/FTCS.1995.466964"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Silva, L.M., Silva, J.G.: An experimental study about diskless checkpointing. In: 24th EUROMICRO Conference, Vasteras, Sweden, pp. 395\u2013402. IEEE Computer Society Press (August 1998)","DOI":"10.1109\/EURMIC.1998.711832"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Monnet, S., Morin, C., Badrinath, R.: Hybrid checkpointing for parallel applications in cluster federations. In: IEEE International Symposium on Cluster Computing and the Grid, CCGrid 2004, pp. 773\u2013782. IEEE (2004)","DOI":"10.1109\/CCGrid.2004.1336712"},{"key":"5_CR29","doi-asserted-by":"crossref","unstructured":"Alvisi, L., Elnozahy, E., Rao, S., Husain, S.A., De Mel, A.: An analysis of communication induced checkpointing. In: Twenty-Ninth Annual International Symposium on Fault-Tolerant Computing. Digest of Papers, pp. 242\u2013249. IEEE (1999)","DOI":"10.1109\/FTCS.1999.781058"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Gioiosa, R., Sancho, J.C., Jiang, S., Petrini, F., Davis, K.: Transparent, incremental checkpointing at kernel level: a foundation for fault tolerance for parallel computers. In: Proceedings of the 2005 ACM\/IEEE Conference on Supercomputing, p. 9. IEEE Computer Society (2005)","DOI":"10.1109\/SC.2005.76"},{"key":"5_CR31","doi-asserted-by":"crossref","unstructured":"Widener, P., Ferreira, K., Levy, S., Bridges, P.G., Arnold, D., Brightwell, R.: Asking the right questions: benchmarking fault-tolerant extreme-scale systems. In:Proc. 6th Workshop on Resiliency in High Performance Computing, Aachen,Germany (August 2013), in conjunction with Euro-Par 2013","DOI":"10.1007\/978-3-642-54420-0_70"},{"key":"5_CR32","unstructured":"Riesen, R., Ferreira, K., Stearley, J., Oldfield, R., Laros III, J.H., Pedretti, K., Brightwell, R., et al.: Redundant computing for exascale systems. Technical report SAND2010-8709. Sandia National Laboratories (2010)"},{"key":"5_CR33","unstructured":"Hoefler, T.: LogGOPSim - A LogGOPS (LogP, LogGP, LogGPS) Simulator and Simulation Framework (April 10, 2013), http:\/\/www.unixer.de\/research\/LogGOPSim\/"},{"issue":"7","key":"5_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/173284.155333","volume":"28","author":"D Culler","year":"1993","unstructured":"Culler, D., Karp, R., Patterson, D., Sahay, A., Schauser, K.E., Santos, E., Subramonian, R., von Eicken, T.: LogP: towards a realistic model of parallel computation. SIGPLAN Not. 28(7), 1\u201312 (1993)","journal-title":"SIGPLAN Not."},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Hoefler, T., Siebert, C., Lumsdaine, A.: Group Operation Assembly Language - a flexible way to express collective communication. In: ICPP-2009 - The 38th International Conference on Parallel Processing. IEEE (September 2009)","DOI":"10.1109\/ICPP.2009.70"},{"key":"5_CR36","doi-asserted-by":"crossref","unstructured":"Tikotekar, A., Vall\u00e9e, G., Naughton, T., Scott, S.L., Leangsuksun, C.: Evaluation of fault-tolerant policies using simulation. In: 2007 IEEE International Conference on Cluster Computing, pp. 303\u2013311. IEEE (2007)","DOI":"10.1109\/CLUSTR.2007.4629244"},{"key":"5_CR37","doi-asserted-by":"crossref","unstructured":"Bohm, S., Engelmann, C.: xSim: The extreme-scale simulator. In: 2011 International Conference on High Performance Computing and Simulation (HPCS), pp. 280\u2013286. IEEE (2011)","DOI":"10.1109\/HPCSim.2011.5999835"},{"key":"5_CR38","doi-asserted-by":"crossref","unstructured":"Boteanu, A., Dobre, C., Pop, F., Cristea, V.: Simulator for fault tolerance in large scale distributed systems. In: 2010 IEEE International Conference on Intelligent Computer Communication and Processing (ICCP), pp. 443\u2013450. IEEE (2010)","DOI":"10.1109\/ICCP.2010.5606401"},{"issue":"2","key":"5_CR39","doi-asserted-by":"publisher","first-page":"57","DOI":"10.4018\/jdst.2010040104","volume":"1","author":"CL Janssen","year":"2010","unstructured":"Janssen, C.L., Adalsteinsson, H., Cranford, S., Kenny, J.P., Pinar, A., Evensky, D.A., Mayo, J.: A simulator for large-scale parallel computer architectures. International Journal of Distributed Systems and Technologies (IJDST) 1(2), 57\u201373 (2010)","journal-title":"International Journal of Distributed Systems and Technologies (IJDST)"},{"key":"5_CR40","unstructured":"Sst: The structural simulation toolkit (2011), http:\/\/sst.sandia.gov\/about_sstmacro.html"},{"key":"5_CR41","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1007\/978-3-642-33518-1_24","volume-title":"Recent Advances in the Message Passing Interface","author":"W Bland","year":"2012","unstructured":"Bland, W., Bouteiller, A., Herault, T., Hursey, J., Bosilca, G., Dongarra, J.J.: An evaluation of user-level failure mitigation support in MPI. In: Tr\u00e4ff, J.L., Benkner, S., Dongarra, J.J. (eds.) EuroMPI 2012. LNCS, vol. 7490, pp. 193\u2013203. Springer, Heidelberg (2012)"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing Systems. Performance Modeling, Benchmarking and Simulation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-10214-6_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,4]],"date-time":"2025-05-04T22:49:23Z","timestamp":1746398963000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-10214-6_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014]]},"ISBN":["9783319102139","9783319102146"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-10214-6_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2014]]},"assertion":[{"value":"1 October 2014","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}