{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T00:05:48Z","timestamp":1781136348379,"version":"3.54.1"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031521850","type":"print"},{"value":"9783031521867","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-52186-7_6","type":"book-chapter","created":{"date-parts":[[2024,1,27]],"date-time":"2024-01-27T09:02:26Z","timestamp":1706346146000},"page":"77-94","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Towards Fault Tolerance and\u00a0Resilience in\u00a0the\u00a0Sequential Codelet Model"],"prefix":"10.1007","author":[{"given":"Diego A. Roa","family":"Perdomo","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rafael A. Herrera","family":"Guaitero","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dawson","family":"Fox","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Herv\u00e9","family":"Yviquel","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Siddhisanket","family":"Raskar","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoming","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jose M. Monsalve","family":"Diaz","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,1,28]]},"reference":[{"key":"6_CR1","unstructured":"Argonne leadership computing facility. https:\/\/www.alcf.anl.gov\/, Accessed 22 July 2023"},{"key":"6_CR2","unstructured":"GitHub - josemonsalve2\/SCM: Sequential Codelet Model of Program Execution \u2013 github.com. https:\/\/github.com\/josemonsalve2\/SCM\/. Accessed 22 July 2023"},{"key":"6_CR3","doi-asserted-by":"publisher","unstructured":"Aguilera, M., Chen, W., Toueg, S.: Heartbeat: a timeout-free failure detector for quiescent reliable communication, vol. 1320, pp. 126\u2013140 (1997). https:\/\/doi.org\/10.1007\/BFb0030680","DOI":"10.1007\/BFb0030680"},{"key":"6_CR4","doi-asserted-by":"publisher","unstructured":"Ahmad, I., Yu-Kwong Kwok, Y.K.K.: A new approach to scheduling parallel programs using task duplication. In: 1994 International Conference on Parallel Processing, vol. 2, pp. 47\u201351 (1994). https:\/\/doi.org\/10.1109\/ICPP.1994.37","DOI":"10.1109\/ICPP.1994.37"},{"key":"6_CR5","doi-asserted-by":"publisher","unstructured":"Ansel, J., Arya, K., Cooperman, G.: DMTCP: transparent checkpointing for cluster computations and the desktop. In: 2009 IEEE International Symposium on Parallel & Distributed Processing, pp. 1\u201312 (2009). https:\/\/doi.org\/10.1109\/IPDPS.2009.5161063","DOI":"10.1109\/IPDPS.2009.5161063"},{"key":"6_CR6","doi-asserted-by":"publisher","unstructured":"Bolchini, C., Miele, A., Sciuto, D.: An adaptive approach for online fault management in many-core architectures (2012). https:\/\/doi.org\/10.1109\/DATE.2012.6176589","DOI":"10.1109\/DATE.2012.6176589"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Bosilca, G., Delmas, R., Dongarra, J., Langou, J.: Algorithmic based fault tolerance applied to high performance computing (2008)","DOI":"10.1016\/j.jpdc.2008.12.002"},{"key":"6_CR8","doi-asserted-by":"publisher","unstructured":"Dennis, J.: A parallel program execution model supporting modular software construction. In: Proceedings. Third Working Conference on Massively Parallel Programming Models (Cat. No. 97TB100228), pp. 50\u201360 (1997). https:\/\/doi.org\/10.1109\/MPPM.1997.715961","DOI":"10.1109\/MPPM.1997.715961"},{"key":"6_CR9","unstructured":"Diaz, J.M.M.: Sequential Codelet Model A SuperCodelet Program Execution Model and Architecture. Phd thesis, University of Delaware, Newark, DE (2021)"},{"key":"6_CR10","doi-asserted-by":"publisher","unstructured":"Diaz, J.M.M., Harms, K., Guaitero, R.A.H., Perdomo, D.A.R., Kumaran, K., Gao, G.R.: The supercodelet architecture. In: Proceedings of the 1st International Workshop on Extreme Heterogeneity Solutions. ExHET 2022. Association for Computing Machinery, New York (2022). https:\/\/doi.org\/10.1145\/3529336.3530823","DOI":"10.1145\/3529336.3530823"},{"key":"6_CR11","doi-asserted-by":"publisher","unstructured":"DiTomaso, D., Kodi, A., Louri, A.: QORE: a fault tolerant network-on-chip architecture with power-efficient quad-function channel (qfc) buffers. In: 2014 IEEE 20th International Symposium on High Performance Computer Architecture (HPCA), pp. 320\u2013331 (2014). https:\/\/doi.org\/10.1109\/HPCA.2014.6835942","DOI":"10.1109\/HPCA.2014.6835942"},{"issue":"3","key":"6_CR12","doi-asserted-by":"publisher","first-page":"1302","DOI":"10.1007\/s11227-013-0884-0","volume":"65","author":"IP Egwutuoha","year":"2013","unstructured":"Egwutuoha, I.P., Levy, D., Selic, B., Chen, S.: A survey of fault tolerance mechanisms and checkpoint\/restart implementations for high performance computing systems. J. Supercomput. 65(3), 1302\u20131326 (2013). https:\/\/doi.org\/10.1007\/s11227-013-0884-0","journal-title":"J. Supercomput."},{"key":"6_CR13","doi-asserted-by":"publisher","unstructured":"Fang, Y., Zou, C., Elmore, A.J., Chien, A.A.: UDP: a programmable accelerator for extract-transform-load workloads and more. In: Proceedings of the 50th Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO-50 2017, pp. 55\u201368. Association for Computing Machinery, New York (2017). https:\/\/doi.org\/10.1145\/3123939.3123983","DOI":"10.1145\/3123939.3123983"},{"key":"6_CR14","unstructured":"Fox, D., Diaz, J.M.M., Li, X.: Chiplets and the codelet model (2022)"},{"key":"6_CR15","unstructured":"Fox, D., Diaz, J.M., Li, X.: On memory codelets: prefetching, recoding, moving and streaming data (2023)"},{"key":"6_CR16","unstructured":"Gao, G., Suetterlein, J., Zuckerman, S.: Toward an Execution Model for Extreme-Scale Systems - Runnemede and Beyond (2011). technical Memo"},{"key":"6_CR17","doi-asserted-by":"publisher","unstructured":"Gizopoulos, D., et al.: Architectures for online error detection and recovery in multicore processors. In: 2011 Design, Automation & Test in Europe (2011). https:\/\/doi.org\/10.1109\/date.2011.5763096","DOI":"10.1109\/date.2011.5763096"},{"key":"6_CR18","unstructured":"IEC: Functional safety of electrical\/electronic\/programmable electronic safety-related systems. Standard IEC 61508\u20131:2010. International Electrotechnical Commission, Geneva, CH (2010). https:\/\/webstore.iec.ch\/publication\/5515"},{"issue":"6","key":"6_CR19","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1109\/MM.2005.119","volume":"25","author":"R Iyer","year":"2005","unstructured":"Iyer, R., Nakka, N., Kalbarczyk, Z., Mitra, S.: Recent advances and new avenues in hardware-level reliability support. IEEE Micro 25(6), 18\u201329 (2005). https:\/\/doi.org\/10.1109\/MM.2005.119","journal-title":"IEEE Micro"},{"key":"6_CR20","doi-asserted-by":"publisher","unstructured":"Kadri, N., Koudil, M.: A survey on fault-tolerant application mapping techniques for network-on-chip. J. Syst. Arch. 92, 39\u201352 (2019). https:\/\/doi.org\/10.1016\/j.sysarc.2018.10.001. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S1383762118301498","DOI":"10.1016\/j.sysarc.2018.10.001"},{"key":"6_CR21","doi-asserted-by":"publisher","unstructured":"Kasap, S., W\u00e4chter, E.W., Zhai, X., Ehsan, S., McDonald-Maier, K.D.: Novel lockstep-based fault mitigation approach for socs with roll-back and roll-forward recovery. Microelectron. Reliabil. 124, 114297 (2021). https:\/\/doi.org\/10.1016\/j.microrel.2021.114297. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0026271421002638","DOI":"10.1016\/j.microrel.2021.114297"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Koren, I., Krishna, C.M.: Fault-Tolerant Systems. Organ Kaufmann (2007)","DOI":"10.1016\/B978-012088525-1\/50007-9"},{"key":"6_CR23","unstructured":"Landwehr, A.: An experimental exploration of self-aware systems for exascale architectures (2016)"},{"issue":"4","key":"6_CR24","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1109\/JPROC.2019.2896848","volume":"107","author":"L Linguaglossa","year":"2019","unstructured":"Linguaglossa, L., et al.: Survey of performance acceleration techniques for network function virtualization. Proc. IEEE 107(4), 746\u2013764 (2019). https:\/\/doi.org\/10.1109\/JPROC.2019.2896848","journal-title":"Proc. IEEE"},{"key":"6_CR25","doi-asserted-by":"publisher","unstructured":"Monsalve, J., Harms, K., Kalyan, K., Gao, G.: Sequential codelet model of program execution - a super-codelet model based on the hierarchical turing machine. In: 2019 IEEE\/ACM Third Annual Workshop on Emerging Parallel and Distributed Runtime Systems and Middleware (IPDRM), pp. 1\u20138 (2019). https:\/\/doi.org\/10.1109\/IPDRM49579.2019.00005","DOI":"10.1109\/IPDRM49579.2019.00005"},{"key":"6_CR26","doi-asserted-by":"publisher","unstructured":"Nicolae, B., Moody, A., Gonsiorowski, E., Mohror, K., Cappello, F.: Veloc: towards high performance adaptive asynchronous checkpointing at large scale. In: 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 911\u2013920 (2019). https:\/\/doi.org\/10.1109\/IPDPS.2019.00099","DOI":"10.1109\/IPDPS.2019.00099"},{"key":"6_CR27","volume-title":"Computer Architecture: A Quantitative Approach","author":"DA Patterson","year":"1990","unstructured":"Patterson, D.A., Hennessy, J.L.: Computer Architecture: A Quantitative Approach. Morgan Kaufmann Publishers Inc., San Francisco (1990)"},{"key":"6_CR28","doi-asserted-by":"publisher","unstructured":"Platunov, A., Sterkhov, A.: Whatchdog mechanisms in embedded systems. Sci. Tech. J. Inf. Technol. Mech. Opt. 301\u2013311 (2017). https:\/\/doi.org\/10.17586\/2226-1494-2017-17-2-301-311","DOI":"10.17586\/2226-1494-2017-17-2-301-311"},{"key":"6_CR29","volume-title":"Fault-Tolerant Real-Time Systems: The Problem of Replica Determinism","author":"S Poledna","year":"1996","unstructured":"Poledna, S.: Fault-Tolerant Real-Time Systems: The Problem of Replica Determinism. Kluwer Academic Publishers, Boston (1996)"},{"key":"6_CR30","doi-asserted-by":"publisher","unstructured":"Qu, P., Yan, J., Zhang, Y., Gao, G.: Parallel turing machine, a proposal. J. Comput. Sci. Technol. 32, 269\u2013285 (2017). https:\/\/doi.org\/10.1007\/s11390-017-1721-3","DOI":"10.1007\/s11390-017-1721-3"},{"key":"6_CR31","doi-asserted-by":"crossref","unstructured":"Rozo Duque, L.A., Monsalve Diaz, J.M., Yang, C.: Improving mpsoc reliability through adapting runtime task schedule based on time-correlated fault behavior. In: 2015 Design, Automation & Test in Europe Conference & Exhibition (DATE), pp. 818\u2013823 (2015)","DOI":"10.7873\/DATE.2015.0450"},{"key":"6_CR32","doi-asserted-by":"publisher","first-page":"12229","DOI":"10.1109\/ACCESS.2022.3144217","volume":"10","author":"S Safari","year":"2022","unstructured":"Safari, S., et al.: A survey of fault-tolerance techniques for embedded systems from the perspective of power, energy, and thermal issues. IEEE Access 10, 12229\u201312251 (2022). https:\/\/doi.org\/10.1109\/ACCESS.2022.3144217","journal-title":"IEEE Access"},{"key":"6_CR33","doi-asserted-by":"publisher","unstructured":"Sahoo, S.S., Ranjbar, B., Kumar, A.: Reliability-aware resource management in multi-\/many-core systems: a perspective paper. J. Low Power Electron. Appl. 11(1) (2021). https:\/\/doi.org\/10.3390\/jlpea11010007. https:\/\/www.mdpi.com\/2079-9268\/11\/1\/7","DOI":"10.3390\/jlpea11010007"},{"key":"6_CR34","doi-asserted-by":"publisher","unstructured":"Salehi, M., Khavari Tavana, M., Rehman, S., Shafique, M., Ejlali, A., Henkel, J.: Two-state checkpointing for energy-efficient fault tolerance in hard real-time systems. IEEE Trans. Very Large Scale Integr. (VLSI) Syst. 24(7), 2426\u20132437 (2016). https:\/\/doi.org\/10.1109\/TVLSI.2015.2512839","DOI":"10.1109\/TVLSI.2015.2512839"},{"key":"6_CR35","doi-asserted-by":"publisher","unstructured":"Sastry Hari, S.K., Li, M.L., Ramachandran, P., Choi, B., Adve, S.V.: Mswat: low-cost hardware fault detection and diagnosis for multicore systems. In: Proceedings of the 42nd Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO 42, pp. 122\u2013132. Association for Computing Machinery, New York (2009). https:\/\/doi.org\/10.1145\/1669112.1669129","DOI":"10.1145\/1669112.1669129"},{"key":"6_CR36","doi-asserted-by":"publisher","unstructured":"Subasi, O., Unsal, O., Krishnamoorthy, S.: Automatic risk-based selective redundancy for fault-tolerant task-parallel hpc applications. In: Proceedings of the Third International Workshop on Extreme Scale Programming Models and Middleware, ESPM22017. Association for Computing Machinery, New York (2017). https:\/\/doi.org\/10.1145\/3152041.3152083","DOI":"10.1145\/3152041.3152083"},{"key":"6_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"633","DOI":"10.1007\/978-3-642-40047-6_63","volume-title":"Euro-Par 2013 Parallel Processing","author":"J Suettlerlein","year":"2013","unstructured":"Suettlerlein, J., Zuckerman, S., Gao, G.R.: An implementation of the codelet model. In: Wolf, F., Mohr, B., an Mey, D. (eds.) Euro-Par 2013. LNCS, vol. 8097, pp. 633\u2013644. Springer, Heidelberg (2013). https:\/\/doi.org\/10.1007\/978-3-642-40047-6_63"},{"issue":"1","key":"6_CR38","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1147\/rd.111.0025","volume":"11","author":"RM Tomasulo","year":"1967","unstructured":"Tomasulo, R.M.: An efficient algorithm for exploiting multiple arithmetic units. IBM J. Res. Dev. 11(1), 25\u201333 (1967). https:\/\/doi.org\/10.1147\/rd.111.0025","journal-title":"IBM J. Res. Dev."},{"key":"6_CR39","doi-asserted-by":"publisher","unstructured":"Weis, S., Garbade, A., Fechner, B., Mendelson, A., Giorgi, R., Ungerer, T.: Architectural support for fault tolerance in a teradevice dataflow system. Int. J. Parallel Program. (2014). https:\/\/doi.org\/10.1007\/s10766-014-0312-y","DOI":"10.1007\/s10766-014-0312-y"},{"key":"6_CR40","doi-asserted-by":"publisher","unstructured":"Weis, S., et al.: A fault detection and recovery architecture for a teradevice dataflow system. In: 2011 First Workshop on Data-Flow Execution Models for Extreme Scale Computing, pp. 38\u201344 (2011). https:\/\/doi.org\/10.1109\/DFM.2011.9","DOI":"10.1109\/DFM.2011.9"}],"container-title":["Communications in Computer and Information Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-52186-7_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,27]],"date-time":"2024-01-27T09:03:43Z","timestamp":1706346223000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-52186-7_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031521850","9783031521867"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-52186-7_6","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"value":"1865-0929","type":"print"},{"value":"1865-0937","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"28 January 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CARLA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Latin American High Performance Computing Conference","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Cartagena","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Colombia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"carla2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EquinOCS","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"26","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"14","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"54% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.2","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1 Invited paper","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}