{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T04:00:17Z","timestamp":1743134417570,"version":"3.40.3"},"publisher-location":"Cham","reference-count":24,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319509945"},{"type":"electronic","value":"9783319509952"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-50995-2_5","type":"book-chapter","created":{"date-parts":[[2016,12,14]],"date-time":"2016-12-14T07:34:49Z","timestamp":1481700889000},"page":"66-81","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Surviving Errors with OpenSHMEM"],"prefix":"10.1007","author":[{"given":"Aurelien","family":"Bouteiller","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"George","family":"Bosilca","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Manjunath Gorentla","family":"Venkata","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2016,12,15]]},"reference":[{"key":"5_CR1","unstructured":"Amarasinghe, S., et al.: Exascale programming challenges. In: Proceedings of the Workshop on Exascale Programming Challenges, Marina del Rey, CA, USA. U.S Department of Energy, Office of Science, Office of Advanced Scientific Computing Research (ASCR), July 2011. \n                      http:\/\/science.energy.gov\/~\/media\/ascr\/pdf\/program-documents\/docs\/ProgrammingChallengesWorkshopReport.pdf"},{"key":"5_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1007\/978-3-642-15646-5_4","volume-title":"Recent Advances in the Message Passing Interface","author":"P Balaji","year":"2010","unstructured":"Balaji, P., Buntinas, D., Goodell, D., Gropp, W., Krishna, J., Lusk, E., Thakur, R.: PMI: a scalable parallel process-management interface for extreme-scale systems. In: Keller, R., Gabriel, E., Resch, M., Dongarra, J. (eds.) EuroMPI 2010. LNCS, vol. 6305, pp. 31\u201341. Springer, Heidelberg (2010). doi:\n                      10.1007\/978-3-642-15646-5_4\n                      \n                    . \n                      http:\/\/dl.acm.org\/citation.cfm?id=1894122.1894127"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Bautista-Gomez, L., Tsuboi, S., Komatitsch, D., Cappello, F., Maruyama, N., Matsuoka, S.: FTI: high performance fault tolerance interface for hybrid systems. In: International Conference on High Performance Computing, Networking, Storage and Analysis, SC 2011 (2011)","DOI":"10.1145\/2063384.2063427"},{"key":"5_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"215","DOI":"10.1007\/978-3-319-17248-4_11","volume-title":"High Performance Computing Systems. Performance Modeling, Benchmarking, and Simulation","author":"A Benoit","year":"2015","unstructured":"Benoit, A., Cavelan, A., Robert, Y., Sun, H.: Assessing general-purpose algorithms to cope with fail-stop and silent errors. In: Jarvis, S.A., Wright, S.A., Hammond, S.D. (eds.) PMBS 2014. LNCS, vol. 8966, pp. 215\u2013236. Springer, Heidelberg (2015). doi:\n                      10.1007\/978-3-319-17248-4_11"},{"issue":"3","key":"5_CR5","doi-asserted-by":"publisher","first-page":"244","DOI":"10.1177\/1094342013488238","volume":"27","author":"W Bland","year":"2013","unstructured":"Bland, W., Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J.: Post-failure recovery of MPI communication capability: design and rationale. Int. J. High Perform. Comput. Appl. 27(3), 244\u2013254 (2013). \n                      http:\/\/hpc.sagepub.com\/content\/27\/3\/244.abstract","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Bosilca, G., Bouteiller, A., Guermouche, A., Herault, T., Sens, P., Robert, Y., Dongarra, J.J.: Failure detection and propagation in HPC systems. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2016. ACM, New York (2016, to appear)","DOI":"10.1109\/SC.2016.26"},{"issue":"17","key":"5_CR7","doi-asserted-by":"publisher","first-page":"2772","DOI":"10.1002\/cpe.3173","volume":"26","author":"G Bosilca","year":"2014","unstructured":"Bosilca, G., Bouteiller, A., Brunet, E., Cappello, F., Dongarra, J., Guermouche, A., Herault, T., Robert, Y., Vivien, F., Zaidouni, D.: Unified model for assessing checkpointing protocols at extreme-scale. Concur. Comput. Pract. Exp. 26(17), 2772\u20132791 (2014). doi:\n                      10.1002\/cpe.3173","journal-title":"Concur. Comput. Pract. Exp."},{"key":"5_CR8","unstructured":"Bouteiller, A., Bosilca, G., Dongarra, J.J.: Plan B: Interruption of ongoing MPI operations to support failure recovery. In: Proceedings of the 22nd European MPI Users\u2019 Group Meeting, EuroMPI 2015, pp. 11:1\u201311:9 (2015). \n                      http:\/\/doi.acm.org\/10.1145\/2802658.2802668"},{"issue":"4","key":"5_CR9","doi-asserted-by":"publisher","first-page":"572","DOI":"10.1002\/cpe.2859","volume":"25","author":"A Bouteiller","year":"2013","unstructured":"Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J.J.: Correlated set coordination in fault tolerant message logging protocols for many-core clusters. Concur. Comput. Pract. Exp. 25(4), 572\u2013585 (2013). doi:\n                      10.1002\/cpe.2859","journal-title":"Concur. Comput. Pract. Exp."},{"issue":"2","key":"5_CR10","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1145\/226643.226647","volume":"43","author":"TD Chandra","year":"1996","unstructured":"Chandra, T.D., Toueg, S.: Unreliable failure detectors for reliable distributed systems. J. ACM (JACM) 43(2), 225\u2013267 (1996)","journal-title":"J. ACM (JACM)"},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Chen, Z.: Online-ABFT: an online algorithm based fault tolerance scheme for soft error detection in iterative methods. In: Proceedings of the PPoPP, pp. 167\u2013176 (2013)","DOI":"10.1145\/2517327.2442533"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Davies, T., Karlsson, C., Liu, H., Ding, C., Chen, Z.: High performance linpack benchmark: a fault tolerant implementation without checkpointing. In: Proceedings of the 25th ACM International Conference on Supercomputing (ICS 2011). ACM (2011)","DOI":"10.1145\/1995896.1995923"},{"issue":"1","key":"5_CR13","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1177\/1094342010391989","volume":"25","author":"J Dongarra","year":"2011","unstructured":"Dongarra, J., et al.: The international exascale software project roadmap. Int. J. High Perform. Comput. Appl. 25(1), 3\u201360 (2011). doi:\n                      10.1177\/1094342010391989","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"5_CR14","unstructured":"Ferreira, K., Stearley, J., Laros III, J.H., Oldfield, R., Pedretti, K., Brightwell, R., Riesen, R., Bridges, P.G., Arnold, D.: Evaluating the viability of process replication reliability for exascale systems. In: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2011, pp. 44:1\u201344:12. ACM, New York (2011). \n                      http:\/\/doi.acm.org\/10.1145\/2063384.2063443"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Fiala, D., Mueller, F., Engelmann, C., Riesen, R., Ferreira, K., Brightwell, R.: Detection and correction of silent data corruption for large-scale high-performance computing. In: Proceedings of the SC 2012, p. 78 (2012)","DOI":"10.1109\/SC.2012.49"},{"key":"5_CR16","unstructured":"Herault, T., Bouteiller, A., Bosilca, G., Gamell, M., Teranishi, K., Parashar, M., Dongarra, J.: Practical scalable consensus for pseudo-synchronous distributed systems. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015, pp. 31:1\u201331:12. ACM, New York (2015). \n                      http:\/\/doi.acm.org\/10.1145\/2807591.2807665"},{"issue":"7","key":"5_CR17","doi-asserted-by":"publisher","first-page":"558","DOI":"10.1145\/359545.359563","volume":"21","author":"L Lamport","year":"1978","unstructured":"Lamport, L.: Time, clocks, and the ordering of events in a distributed system. Commun. ACM 21(7), 558\u2013565 (1978)","journal-title":"Commun. ACM"},{"issue":"3","key":"5_CR18","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1145\/357172.357176","volume":"4","author":"L Lamport","year":"1982","unstructured":"Lamport, L., Shostak, R., Pease, M.: The byzantine generals problem. ACM Trans. Program. Lang. Syst. 4(3), 382\u2013401 (1982). doi:\n                      10.1145\/357172.357176","journal-title":"ACM Trans. Program. Lang. Syst."},{"key":"5_CR19","doi-asserted-by":"crossref","unstructured":"Moody, A., Bronevetsky, G., Mohror, K., de Supinski, B.R.: Design, modeling, and evaluation of a scalable multi-level checkpointing system. In: Proceedings of the 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201311 (2010). \n                      http:\/\/dx.doi.org\/10.1109\/SC.2010.18","DOI":"10.1109\/SC.2010.18"},{"issue":"2","key":"5_CR20","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1023\/A:1022852505633","volume":"6","author":"F Petrini","year":"2003","unstructured":"Petrini, F., Frachtenberg, E., Hoisie, A., Coll, S.: Performance evaluation of the quadrics interconnection network. Cluster Comput. 6(2), 125\u2013142 (2003). doi:\n                      10.1023\/A:1022852505633","journal-title":"Cluster Comput."},{"key":"5_CR21","first-page":"1379","volume-title":"Encyclopedia of Parallel Computing","author":"SW Poole","year":"2011","unstructured":"Poole, S.W., Hernandez, O.R., Kuehn, J.A., Shipman, G.M., Curtis, A., Feind, K.: OpenSHMEM - toward a unified RMA model. In: Padua, D.A. (ed.) Encyclopedia of Parallel Computing, pp. 1379\u20131391. Springer, Heidelberg (2011)"},{"key":"5_CR22","first-page":"12","volume":"78","author":"B Schroeder","year":"2007","unstructured":"Schroeder, B., Gibson, G.: Understanding failures in petascale computers. J. Phys.: Conf. Ser. 78, 12\u201322 (2007). IOP Publishing","journal-title":"J. Phys.: Conf. Ser."},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Shahzad, F., Kreutzer, M., Zeiser, T., Machado, R., Pieper, A., Hager, G., Wellein, G.: Building a fault tolerant application using the GASPI communication layer. In: Proceedings of the 2015 IEEE International Conference on Cluster Computing, CLUSTER 2015, pp. 580\u2013587. IEEE Computer Society, Washington (2015). \n                      http:\/\/dx.doi.org\/10.1109\/CLUSTER.2015.106","DOI":"10.1109\/CLUSTER.2015.106"},{"key":"5_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"124","DOI":"10.1007\/978-3-319-17353-5_11","volume-title":"High Performance Computing for Computational Science \u2013 VECPAR 2014","author":"Z Zheng","year":"2015","unstructured":"Zheng, Z., Chien, A.A., Teranishi, K.: Fault tolerance in an inner-outer solver: a GVR-enabled case study. In: Dayd\u00e9, M., Marques, O., Nakajima, K. (eds.) VECPAR 2014. LNCS, vol. 8969, pp. 124\u2013132. Springer, Heidelberg (2015). doi:\n                      10.1007\/978-3-319-17353-5_11"}],"container-title":["Lecture Notes in Computer Science","OpenSHMEM and Related Technologies. Enhancing OpenSHMEM for Hybrid Environments"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-50995-2_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,20]],"date-time":"2019-05-20T02:00:15Z","timestamp":1558317615000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-50995-2_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319509945","9783319509952"],"references-count":24,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-50995-2_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"15 December 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"OpenSHMEM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Workshop on OpenSHMEM and Related Technologies","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Baltimore","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 August 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 August 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"openshmem2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}