{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T00:07:22Z","timestamp":1758931642434,"version":"3.44.0"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031488023"},{"type":"electronic","value":"9783031488030"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-48803-0_9","type":"book-chapter","created":{"date-parts":[[2024,4,13]],"date-time":"2024-04-13T03:02:21Z","timestamp":1712977341000},"page":"102-114","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Task-Level Checkpointing for\u00a0Nested Fork-Join Programs Using Work Stealing"],"prefix":"10.1007","author":[{"given":"Mia","family":"Reitz","sequence":"first","affiliation":[]},{"given":"Claudia","family":"Fohry","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,4,14]]},"reference":[{"key":"9_CR1","unstructured":"Fohry, C.: Checkpointing and localized recovery for nested fork-join programs. In: International Symposium on Checkpointing for Supercomputing (SuperCheck) (2021). https:\/\/arxiv.org\/abs\/2102.12941"},{"key":"9_CR2","unstructured":"Laboratory, O.R.N.: Frontier. https:\/\/www.olcf.ornl.gov\/frontier"},{"key":"9_CR3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-20943-2","volume-title":"Fault-Tolerance Techniques for High-Performance Computing","author":"T Herault","year":"2015","unstructured":"Herault, T., Robert, Y.: Fault-Tolerance Techniques for High-Performance Computing. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-20943-2"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Benoit, A., Herault, T., F\u00e8vre, V.L., Robert, Y.: Replication is more efficient than you think. In: Proceedings of International Conference for High Performance Computing, Networking, Storage and Analysis (SC), pp. 1\u201314. ACM (2019)","DOI":"10.1145\/3295500.3356171"},{"key":"9_CR5","doi-asserted-by":"publisher","first-page":"467","DOI":"10.1016\/j.future.2020.01.026","volume":"106","author":"N Losada","year":"2020","unstructured":"Losada, N., Gonz\u00e1lez, P., Mart\u00ecn, M.J., Bosilca, G., Bouteiller, A., Teranishi, K.: Fault tolerance of MPI applications in exascale systems: the ULFM solution. Future Generation Comput. Syst. (FGCS) 106, 467\u2013481 (2020)","journal-title":"Future Generation Comput. Syst. (FGCS)"},{"issue":"4","key":"9_CR6","doi-asserted-by":"publisher","first-page":"1340011","DOI":"10.1142\/S0129626413400112","volume":"23","author":"F Shahzad","year":"2013","unstructured":"Shahzad, F., Wittmann, M., Kreutzer, M., Zeise, T., Hager, G., Wellein, G.: A survey of checkpoint\/restart techniques on distributed memory systems. Parallel Process. Lett. (PPL) 23(4), 1340011\u20131340030 (2013)","journal-title":"Parallel Process. Lett. (PPL)"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Lion, R., Thibault, S.: From tasks graphs to asynchronous distributed checkpointing with local restart. In: Proceedings of International Conference on High Performance Computing, Networking, Storage and Analysis (SC) Workshops (FTXS), pp. 31\u201340. ACM (2020)","DOI":"10.1109\/FTXS51974.2020.00009"},{"key":"9_CR8","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1016\/j.future.2019.11.031","volume":"105","author":"J Posner","year":"2019","unstructured":"Posner, J., Reitz, M., Fohry, C.: A comparison of application-level fault tolerance schemes for task pools. Future Generation Comput. Syst. (FGCS) 105, 119\u2013134 (2019)","journal-title":"Future Generation Comput. Syst. (FGCS)"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Kaiser, H., Heller, T., Adelstein-Lelbach, B., Serio, A., Fey, D.: HPX: a task based programming model in a global address space. In: Proceedings of International Conference on Partitioned Global Address Space Programming Models (PGAS), pp. 1\u201311. ACM (2014)","DOI":"10.1145\/2676870.2676883"},{"key":"9_CR10","unstructured":"OpenMP Architecture Review Board: OpenMP application programming interface (version 5.2). openmp.org (2021)"},{"issue":"3","key":"9_CR11","first-page":"91","volume":"21","author":"BL Chamberlain","year":"2007","unstructured":"Chamberlain, B.L., Callahan, D., Zima, H.P.: Parallel programmability and the Chapel language. Int. J. High Perform. Comput. Appl. (IJHPCA) 21(3), 91\u2013312 (2007)","journal-title":"Int. J. High Perform. Comput. Appl. (IJHPCA)"},{"issue":"5","key":"9_CR12","doi-asserted-by":"publisher","first-page":"720","DOI":"10.1145\/324133.324234","volume":"46","author":"RD Blumofe","year":"1999","unstructured":"Blumofe, R.D., Leiserson, C.E.: Scheduling multithreaded computations by work stealing. J. ACM 46(5), 720\u2013748 (1999)","journal-title":"J. ACM"},{"key":"9_CR13","unstructured":"Fohry, C.: An overview of task-based parallel programming models. In: Tutorial at European Network on High-performance Embedded Architecture and Compilation Conference (HiPEAC) (2019)"},{"key":"9_CR14","doi-asserted-by":"crossref","unstructured":"Posner, J., Reitz, M., Fohry, C.: Task-level resilience: checkpointing vs. supervision. Special Issue Int. J. Netw. Comput. (IJNC) 12(1), 47\u201372 (2022)","DOI":"10.15803\/ijnc.12.1_47"},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Reitz, M., Fohry, C.: Lifeline-based load balancing schemes for asynchronous many-task runtimes in clusters. Special Issue J. Parallel Comput. (PARCO) 116, 103020 (2023)","DOI":"10.1016\/j.parco.2023.103020"},{"key":"9_CR16","doi-asserted-by":"crossref","unstructured":"Saraswat, V.A., Kambadur, P., Kodali, S., Grove, D., Krishnamoorthy, S.: Lifeline-based global load balancing. In: Proceedings of SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP), pp. 201\u2013212. ACM (2011)","DOI":"10.1145\/1941553.1941582"},{"key":"9_CR17","unstructured":"Blumofe, R.D., Lisiecki, P.A.: Adaptive and reliable parallel computing on networks of workstations. In: Proceedings of Annual Conference on USENIX, pp. 1\u201310 (1997)"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Kestor, G., Krishnamoorthy, S., Ma, W.: Localized fault recovery for nested fork-join programs. In: Proceedings of Internetional Symposium on Parallel and Distributed Processing (IPDPS), pp. 397\u2013408. IEEE (2017)","DOI":"10.1109\/IPDPS.2017.75"},{"issue":"13","key":"9_CR19","doi-asserted-by":"publisher","first-page":"925","DOI":"10.4236\/jsea.2017.1013053","volume":"10","author":"C Fohry","year":"2017","unstructured":"Fohry, C., Bungart, M., Plock, P.: Fault tolerance for lifeline-based global load balancing. J. Softw. Eng. Appl. (JSEA) 10(13), 925\u2013958 (2017)","journal-title":"J. Softw. Eng. Appl. (JSEA)"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Tardieu, O.: The APGAS library: resilient parallel and distributed programming in Java 8. In: Proceedings of SIGPLAN Workshop on X10, pp. 25\u201326. ACM (2015)","DOI":"10.1145\/2771774.2771780"},{"key":"9_CR21","unstructured":"Hazelcast: The leading open source in-memory data grid. http:\/\/hazelcast.org"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Lea, D.: A Java fork\/join framework. In: Proceedings of the Conference on Java Grande, pp. 36\u201343. ACM (2000)","DOI":"10.1145\/337449.337465"},{"key":"9_CR23","unstructured":"TOP500.org: Goethe-HLR. https:\/\/www.top500.org\/system\/179588"},{"key":"9_CR24","unstructured":"IBM: The APGAS library for fault-tolerant distributed programming in Java 8 (version from Oct 10, 2016). https:\/\/github.com\/x10-lang\/apgas"},{"key":"9_CR25","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"346","DOI":"10.1007\/978-3-030-29400-7_25","volume-title":"Euro-Par 2019: Parallel and Distributed Computing","author":"SR Paul","year":"2019","unstructured":"Paul, S.R., et al.: Enabling resilience in asynchronous many-task programming models. In: Yahyapour, R. (ed.) Euro-Par 2019. LNCS, vol. 11725, pp. 346\u2013360. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-29400-7_25"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Gupta, N., Mayo, J.R., Lemoine, A.S., Kaiser, H.: Towards distributed software resilience in asynchronous many- task programming models. In: Workshop on Fault Tolerance for HPC at eXtreme Scale (FTXS), pp. 11\u201320 (2020)","DOI":"10.1109\/FTXS51974.2020.00007"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Kurt, M.C., Krishnamoorthy, S., Agrawal, K., Agrawal, G.: Fault-tolerant dynamic task graph scheduling. In: Proceedings of International Conference for High Performance Computing, Networking, Storage and Analysis (SC), pp. 719\u2013730. ACM (2014)","DOI":"10.1109\/SC.2014.64"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Subasi, O., Yalcin, G., Zyulkyarov, F., Unsal, O., Labarta, J.: Designing and modelling selective replication for fault-tolerant HPC applications. In: International Symposium on Cluster, Cloud and Grid Computing (CCGRID), pp. 452\u2013457 (2017)","DOI":"10.1109\/CCGRID.2017.40"},{"key":"9_CR29","doi-asserted-by":"publisher","first-page":"233","DOI":"10.1504\/IJHPCN.2019.106124","volume":"15","author":"A Semmoud","year":"2019","unstructured":"Semmoud, A., Hakem, M., Benmammar, B.: A survey of load balancing in distributed systems. Int. J. High Perform. Comput. Netw. 15, 233 (2019)","journal-title":"Int. J. High Perform. Comput. Netw."},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Finnerty, P., Kamada, T., Ohta, C.: Self-adjusting task granularity for global load balancer library on clusters of many-core processors. In: Proceedings of International Workshop on Programming Models and Applications for Multicores and Manycores (PMAM). ACM (2020)","DOI":"10.1145\/3380536.3380539"},{"key":"9_CR31","unstructured":"Reitz, M.: Task-level checkpointing for nested fork-join programs. In: Proceedings of International Parallel and Distributed Processing Symposium (IPDPS), Ph.D. Forum, Extended Abstract. IEEE (2021)"}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2023: Parallel Processing Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-48803-0_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,26]],"date-time":"2025-09-26T11:14:13Z","timestamp":1758885253000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-48803-0_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031488023","9783031488030"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-48803-0_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"14 April 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Euro-Par","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Parallel Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Limassol","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Cyprus","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"europar2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2023.euro-par.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}