{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,13]],"date-time":"2025-06-13T16:10:02Z","timestamp":1749831002226,"version":"3.41.0"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319509945"},{"type":"electronic","value":"9783319509952"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-50995-2_4","type":"book-chapter","created":{"date-parts":[[2016,12,14]],"date-time":"2016-12-14T12:34:49Z","timestamp":1481718889000},"page":"52-65","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["System-Level Transparent Checkpointing for OpenSHMEM"],"prefix":"10.1007","author":[{"given":"Rohan","family":"Garg","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"J\u00e9r\u00f4me","family":"Vienne","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gene","family":"Cooperman","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2016,12,15]]},"reference":[{"key":"4_CR1","doi-asserted-by":"publisher","DOI":"10.1109\/PDP.2011.72","volume-title":"A Redundant Communication Approach to Scalable Fault Tolerance in PGAS Programming Models","author":"N Ali","year":"2011","unstructured":"Ali, N., Krishnamoorthy, S., Govind, N., Palmer, B.J.: A Redundant Communication Approach to Scalable Fault Tolerance in PGAS Programming Models. IEEE Computer Society, Los Alamitos (2011)"},{"doi-asserted-by":"crossref","unstructured":"Ansel, J., Arya, K., Cooperman, G.: DMTCP: transparent checkpointing for cluster computations and the desktop. In: IEEE International Symposium on Parallel and Distributed Processing (IPDPS), pp. 1\u201312. IEEE Press (2009)","key":"4_CR2","DOI":"10.1109\/IPDPS.2009.5161063"},{"issue":"3","key":"4_CR3","doi-asserted-by":"publisher","first-page":"63","DOI":"10.1177\/109434209100500306","volume":"5","author":"DH Bailey","year":"1991","unstructured":"Bailey, D.H., Barszcz, E., Barton, J.T., Browning, D.S., Carter, R.L., Dagum, D., Fatoohi, R.A., Frederickson, P.O., Lasinski, T.A., Schreiber, R.S., Simon, H.D., Venkatakrishnan, V., Weeratunga, S.K.: The NAS parallel benchmarks. Intl. J. Supercomput. Appl. 5(3), 63\u201373 (1991)","journal-title":"Intl. J. Supercomput. Appl."},{"unstructured":"BLCR team: BLCR frequently asked questions (for version 0.8.5). https:\/\/upc-bugs.lbl.gov\/blcr\/doc\/html\/FAQ.html#limitations . Accessed June 2016","key":"4_CR4"},{"key":"4_CR5","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1177\/1094342006067469","volume":"20","author":"A Bouteiler","year":"2006","unstructured":"Bouteiler, A., Herault, T., Krawezik, G., Lemarinier, P., Cappello, F.: MPICH-V project: a multiprotocol automatic fault tolerant MPI. Int. J. High Perform. Comput. Appl. 20, 319\u2013333 (2006)","journal-title":"Int. J. High Perform. Comput. Appl."},{"doi-asserted-by":"crossref","unstructured":"Bronevetsky, G., Marques, D., Pingali, K., Rugina, R., McKee, S.A.: Compiler-enhanced incremental checkpointing for OpenMP applications. In: Proceedings of IEEE International Parallel and Distributed Processing Symposium (IPDPS), May 2009","key":"4_CR6","DOI":"10.1109\/IPDPS.2009.5160999"},{"doi-asserted-by":"crossref","unstructured":"Bronevetsky, G., Marques, D., Pingali, K., Stodghill, P.: Automated application-level checkpointing of MPI programs. In: PPoPP 2003: Proceedings of the Ninth ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, NY, USA, pp. 84\u201394. ACM, New York (2003)","key":"4_CR7","DOI":"10.1145\/966049.781513"},{"doi-asserted-by":"crossref","unstructured":"Cao, J., Kerr, G., Arya, K., Cooperman, G.: Transparent checkpoint-restart over InfiniBand. In: Proceedings of the 23rd International Symposium on High-performance Parallel and Distributed Computing, pp. 13\u201324. ACM Press (2014)","key":"4_CR8","DOI":"10.1145\/2600212.2600219"},{"doi-asserted-by":"crossref","unstructured":"Chapman, B., Curtis, T., Pophale, S., Poole, S., Kuehn, J., Koelbel, C., Smith, L.: Introducing OpenSHMEM: SHMEM for the PGAS community. In: Proceedings of the Fourth Conference on Partitioned Global Address Space Programming Model, pp. 2:1\u20132:3, PGAS 2010, NY, USA. ACM, New York (2010)","key":"4_CR9","DOI":"10.1145\/2020373.2020375"},{"doi-asserted-by":"crossref","unstructured":"Duell, J., Hargrove, P., Roman, E.: The design and implementation of Berkeley lab\u2019s Linux checkpoint\/restart (BLCR). Technical report LBNL-54941, Lawrence Berkeley National Laboratory (2003)","key":"4_CR10","DOI":"10.2172\/793773"},{"unstructured":"Gao, Q., Yu, W., Huang, W., Panda, D.K.: Application-transparent checkpoint\/restart for MPI programs over InfiniBand. In: ICPP 2006: Proceedings of the 2006 International Conference on Parallel Processing, pp. 471\u2013478. IEEE Computer Society, Washington, DC (2006)","key":"4_CR11"},{"doi-asserted-by":"crossref","unstructured":"Graham, R.L., Woodall, T.S., Squyres, J.M.: Open MPI: a flexible high performance MPI. In: Proceedings of the 6th Annual International Conference on Parallel Processing and Applied Mathematics, Poznan, Poland, September 2005","key":"4_CR12","DOI":"10.1007\/11752578_29"},{"unstructured":"Hammond, J.: OSHMPI (06 2016). https:\/\/github.com\/jeffhammond\/oshmpi","key":"4_CR13"},{"key":"4_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1007\/978-3-319-05215-1_4","volume-title":"OpenSHMEM and Related Technologies. Experiences, Implementations, and Tools","author":"JR Hammond","year":"2014","unstructured":"Hammond, J.R., Ghosh, S., Chapman, B.M.: Implementing OpenSHMEM using MPI-3 one-sided communication. In: Poole, S., Hernandez, O., Shamis, P. (eds.) OpenSHMEM 2014. LNCS, vol. 8356, pp. 44\u201358. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-05215-1_4"},{"key":"4_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1007\/978-3-319-26428-8_3","volume-title":"OpenSHMEM and Related Technologies. Experiences, Implementations, and Technologies","author":"P Hao","year":"2015","unstructured":"Hao, P., Pophale, S., Shamis, P., Curtis, T., Chapman, B.: Check-pointing approach for fault tolerance in OpenSHMEM. In: Gorentla Venkata, M., Shamis, P., Imam, N., Lopez, M.G. (eds.) OpenSHMEM 2014. LNCS, vol. 9397, pp. 36\u201352. Springer, Heidelberg (2015). doi: 10.1007\/978-3-319-26428-8_3"},{"doi-asserted-by":"crossref","unstructured":"Hao, P., Shamis, P., Venkata, M.G., Pophale, S., Welch, A., Poole, S., Chapman, B.: Fault tolerance for OpenSHMEM. In: Proceedings of the 8th International Conference on Partitioned Global Address Space Programming Models, PGAS 2014, pp. 23:1\u201323:3 (2014)","key":"4_CR16","DOI":"10.1145\/2676870.2676894"},{"key":"4_CR17","doi-asserted-by":"publisher","first-page":"494","DOI":"10.1088\/1742-6596\/46\/1\/067","volume":"46","author":"P Hargrove","year":"2006","unstructured":"Hargrove, P., Duell, J.: Berkeley lab checkpoint\/restart (BLCR) for Linux clusters. J. Phys. Conf. Ser. 46, 494\u2013499 (2006)","journal-title":"J. Phys. Conf. Ser."},{"unstructured":"High Performance Computing Tools Group at the University of Houston, Extreme Scale Systems Center, Oak Ridge National Laboratory: OpenSHMEM Application Programming interface (version\u00a01.3). http:\/\/openshmem.org\/site\/sites\/default\/site_files\/OpenSHMEM-1.3.pdf . Accessed June 2016","key":"4_CR18"},{"doi-asserted-by":"crossref","unstructured":"Huang, W., Santhanaraman, G., Jin, H., Gao, Q., Panda, D.: Design and Implementation of High Performance MVAPICH2: MPI2 Over InfiniBand, May 2007","key":"4_CR19","DOI":"10.1109\/CCGRID.2006.32"},{"doi-asserted-by":"crossref","unstructured":"Hursey, J., Squyres, J.M., Mattox, T.I., Lumsdain, A.: The design and implementation of checkpoint\/restart process fault tolerance for open MPI. In: Proceedings of the 21st IEEE International Parallel and Distributed Processing Symposium (IPDPS)\/12th IEEE Workshop on Dependable Parallel, Distributed and Network-Centric Systems. IEEE Computer Society, March 2007","key":"4_CR20","DOI":"10.1109\/IPDPS.2007.370605"},{"doi-asserted-by":"crossref","unstructured":"Janakiraman, G., Santos, J., Subhraveti, D., Turner, Y.: Cruz: application-transparent distributed checkpoint-restart on standard operating systems. In: Dependable Systems and Networks (DSN 2005), pp. 260\u2013269 (2005)","key":"4_CR21","DOI":"10.1109\/DSN.2005.33"},{"doi-asserted-by":"crossref","unstructured":"Jose, J., Hamidouche, K., Zhang, J., Venkatesh, A., Panda, D.: Optimizing collective communication in UPC, May 2014","key":"4_CR22","DOI":"10.1109\/IPDPSW.2014.49"},{"key":"4_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1007\/978-3-319-05215-1_2","volume-title":"OpenSHMEM and Related Technologies. Experiences, Implementations, and Tools","author":"J Jose","year":"2014","unstructured":"Jose, J., Zhang, J., Venkatesh, A., Potluri, S., Panda, D.K.D.: A comprehensive performance evaluation of OpenSHMEM libraries on InfiniBand clusters. In: Poole, S., Hernandez, O., Shamis, P. (eds.) OpenSHMEM 2014. LNCS, vol. 8356, pp. 14\u201328. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-05215-1_2"},{"unstructured":"Laadan, O., Nieh, J.: Transparent checkpoint-restart of multiple processes for commodity clusters. In: 2007 USENIX Annual Technical Conference, pp. 323\u2013336 (2007)","key":"4_CR24"},{"unstructured":"Laadan, O., Phung, D., Nieh, J.: Transparent networked checkpoint-restart for commodity clusters. In: 2005 IEEE International Conference on Cluster Computing. IEEE Press (2005)","key":"4_CR25"},{"unstructured":"Laboratory, N.B.C.: MVAPICH2 (06 2016). http:\/\/mvapich.cse.ohio-state.edu\/","key":"4_CR26"},{"unstructured":"Laboratory, N.B.C.: MVAPICH2-X (06 2016). http:\/\/mvapich.cse.ohio-state.edu\/","key":"4_CR27"},{"unstructured":"NASA Advanced Supercomputing Division: NAS Parallel Benchmarks. http:\/\/www.nas.nasa.gov\/publications\/npb.html . Accessed Apr 2016","key":"4_CR28"},{"unstructured":"Pophale, S., Nanjegowda, R., Curtis, T., Chapman, B., Jin, H., Poole, S., Kuehn, J.: OpenSHMEM performance and potential: a NPB experimental study. In: The 6th Conference on Partitioned Global Address Space Programming Models (PGAS 2012). Citeseer (2012)","key":"4_CR29"},{"issue":"4","key":"4_CR30","doi-asserted-by":"publisher","first-page":"479","DOI":"10.1177\/1094342005056139","volume":"19","author":"S Sankaran","year":"2005","unstructured":"Sankaran, S., Squyres, J.M., Barrett, B., Sahay, V., Lumsdaine, A., Duell, J., Hargrove, P., Roman, E.: The LAM\/MPI checkpoint\/restart framework: system-initiated checkpointing. Int. J. High Perform. Comput. Appl. 19(4), 479\u2013493 (2005)","journal-title":"Int. J. High Perform. Comput. Appl."},{"doi-asserted-by":"crossref","unstructured":"Sudakov, O.O., Meshcheriakov, I.S., Boyko, Y.V.: CHPOX: transparent checkpointing system for Linux clusters. In: IEEE International Workshop on Intelligent Data Acquisition and Advanced Computing Systems: Technology and Applications, pp. 159\u2013164 (2007). software available at http:\/\/freshmeat.net\/projects\/chpox\/","key":"4_CR31","DOI":"10.1109\/IDAACS.2007.4488396"},{"unstructured":"TOP500 supercomputer sites (Jun 2016). http:\/\/top500.org\/list\/2016\/06\/","key":"4_CR32"},{"doi-asserted-by":"crossref","unstructured":"Vienne, J., Chen, J., Wasi-Ur-Rahman, M., Islam, N.S., Subramoni, H., Panda, D.K.: Performance analysis and evaluation of InfiniBand FDR and 40GigE RoCE on HPC and cloud computing systems. In: Hot Interconnects, pp. 48\u201355 (2012)","key":"4_CR33","DOI":"10.1109\/HOTI.2012.19"},{"doi-asserted-by":"crossref","unstructured":"Wong, F.C., Martin, R.P., Arpaci-Dusseau, R.H., Culler, D.E.: Architectural requirements and scalability of the NAS parallel benchmarks. In: Supercomputing (1999)","key":"4_CR34","DOI":"10.1145\/331532.331573"}],"container-title":["Lecture Notes in Computer Science","OpenSHMEM and Related Technologies. Enhancing OpenSHMEM for Hybrid Environments"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-50995-2_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,13]],"date-time":"2025-06-13T15:56:47Z","timestamp":1749830207000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-50995-2_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319509945","9783319509952"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-50995-2_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"15 December 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"OpenSHMEM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Workshop on OpenSHMEM and Related Technologies","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Baltimore","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 August 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 August 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"openshmem2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}