{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:40:16Z","timestamp":1740123616757,"version":"3.37.3"},"reference-count":25,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2022,3,27]],"date-time":"2022-03-27T00:00:00Z","timestamp":1648339200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,3,27]],"date-time":"2022-03-27T00:00:00Z","timestamp":1648339200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100013076","name":"national major science and technology projects of china","doi-asserted-by":"publisher","award":["2016YFB0200605"],"award-info":[{"award-number":["2016YFB0200605"]}],"id":[{"id":"10.13039\/501100013076","id-type":"DOI","asserted-by":"publisher"}]},{"name":"guangdong province key laboratory of computational science at the sun yat-sen university","award":["2020B1212060032"],"award-info":[{"award-number":["2020B1212060032"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2022,8]]},"DOI":"10.1007\/s11227-022-04347-0","type":"journal-article","created":{"date-parts":[[2022,3,28]],"date-time":"2022-03-28T08:02:37Z","timestamp":1648454557000},"page":"14009-14033","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Failure detection algorithm for Fail-Lagging model applied to HPC"],"prefix":"10.1007","volume":"78","author":[{"given":"Yingjun","family":"Ye","sequence":"first","affiliation":[]},{"given":"Yongdong","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Weicai","family":"Ye","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,3,27]]},"reference":[{"key":"4347_CR1","doi-asserted-by":"publisher","unstructured":"Aguilera MK, Chen W, Toueg S (1998) Failure detection and consensus in the crash-recovery model. In: Kutten S (ed) Distributed Computing, 12th International Symposium, DISC \u201998, Andros, Greece, September 24-26, Proceedings, Lecture Notes in Computer Science, vol 1499, pp 231\u2013245. Springer. https:\/\/doi.org\/10.1007\/BFb0056486","DOI":"10.1007\/BFb0056486"},{"key":"4347_CR2","unstructured":"Albrecht JR, Tuttle C, Snoeren AC, Vahdat A (2006) Loose synchronization for large-scale networked systems. In: Adya A, Nahum EM (eds) Proceedings of the 2006 USENIX Annual Technical Conference, Boston, MA, USA, May 30\u2013June 3, pp 301\u2013314. USENIX. http:\/\/www.usenix.org\/events\/usenix06\/tech\/albrecht.html"},{"key":"4347_CR3","doi-asserted-by":"publisher","unstructured":"Angskun T, Bosilca G, Dongarra JJ (2007) Binomial graph: a scalable and fault-tolerant logical network topology. In: Stojmenovic I, Thulasiram RK, Yang LT, Jia W, Guo M, de\u00a0Mello RF (eds) Parallel and Distributed Processing and Applications, 5th International Symposium, ISPA 2007, Niagara Falls, Canada, August 29\u201331, 2007, Proceedings, Lecture Notes in Computer Science, vol 4742, pp 471\u2013482. Springer. https:\/\/doi.org\/10.1007\/978-3-540-74742-0_43","DOI":"10.1007\/978-3-540-74742-0_43"},{"key":"4347_CR4","doi-asserted-by":"publisher","unstructured":"Angskun T, Fagg GE, Bosilca G, Pjesivac-Grbovic J, Dongarra JJ (2006) Scalable fault tolerant protocol for parallel runtime environments. In: Mohr B, Tr\u00e4ff JL, Worringen J, Dongarra JJ (eds) Recent Advances in Parallel Virtual Machine and Message Passing Interface, 13th European PVM\/MPI User\u2019s Group Meeting, Bonn, Germany, September 17\u201320, Proceedings, Lecture Notes in Computer Science, vol 4192, pp 141\u2013149. Springer. https:\/\/doi.org\/10.1007\/11846802_25","DOI":"10.1007\/11846802_25"},{"key":"4347_CR5","doi-asserted-by":"publisher","unstructured":"Arpaci-Dusseau RH, Arpaci-Dusseau AC (2001) Fail-stutter fault tolerance. In: Proceedings of HotOS-VIII: 8th Workshop on Hot Topics in Operating Systems, May 20\u201323, Elmau\/Oberbayern, Germany, pp 33\u201338. IEEE Computer Society. https:\/\/doi.org\/10.1109\/HOTOS.2001.990058","DOI":"10.1109\/HOTOS.2001.990058"},{"key":"4347_CR6","doi-asserted-by":"publisher","unstructured":"Bosilca G, Bouteiller A, Guermouche A, H\u00e9rault T, Robert Y, Sens P, Dongarra JJ (2016) Failure detection and propagation in HPC systems. In: West J, Pancake CM (eds) Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2016, Salt Lake City, UT, USA, November 13\u201318, pp 312\u2013322. IEEE Computer Society. https:\/\/doi.org\/10.1109\/SC.2016.26","DOI":"10.1109\/SC.2016.26"},{"issue":"1","key":"4347_CR7","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1177\/1094342017711505","volume":"32","author":"G Bosilca","year":"2018","unstructured":"Bosilca G, Bouteiller A, Guermouche A, H\u00e9rault T, Robert Y, Sens P, Dongarra JJ (2018) A failure detector for HPC platforms. Int J High Perform Comput Appl 32(1):139\u2013158. https:\/\/doi.org\/10.1177\/1094342017711505","journal-title":"Int J High Perform Comput Appl"},{"issue":"2","key":"4347_CR8","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1145\/226643.226647","volume":"43","author":"Tushar Deepak Chandra","year":"1996","unstructured":"Chandra TD, Toueg S (1996) Unreliable failure detectors for reliable distributed systems. J ACM 43(2):225\u2013267. https:\/\/doi.org\/10.1145\/226643.226647","journal-title":"J ACM"},{"issue":"12","key":"4347_CR9","doi-asserted-by":"publisher","first-page":"1628","DOI":"10.1109\/TPDS.2008.58","volume":"19","author":"Z Chen","year":"2008","unstructured":"Chen Z, Dongarra JJ (2008) Algorithm-based fault tolerance for fail-stop failures. IEEE Trans Parallel Distrib Syst 19(12):1628\u20131641. https:\/\/doi.org\/10.1109\/TPDS.2008.58","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"4347_CR10","unstructured":"Dwork C, Lynch NA, Stockmeyer LJ (1984) Consensus in the presence of partial synchrony (preliminary version). In: Kameda T, Misra J, Peters JG, Santoro N (eds) Proceedings of the Third Annual ACM Symposium on Principles of Distributed Computing, Vancouver, B. C., Canada, August 27\u201329, pp 103\u2013118. ACM. https:\/\/dl.acm.org\/citation.cfm?id=1599406"},{"issue":"3","key":"4347_CR11","doi-asserted-by":"publisher","first-page":"1302","DOI":"10.1007\/s11227-013-0884-0","volume":"65","author":"IP Egwutuoha","year":"2013","unstructured":"Egwutuoha IP, Levy D, Selic B, Chen S (2013) A survey of fault tolerance mechanisms and checkpoint\/restart implementations for high performance computing systems. J Supercomput 65(3):1302\u20131326. https:\/\/doi.org\/10.1007\/s11227-013-0884-0","journal-title":"J Supercomput"},{"key":"4347_CR12","doi-asserted-by":"publisher","unstructured":"Ferreira K, Stearley J, Laros JH, Oldfield R, Pedretti K, Brightwell R, Riesen R, Bridges PG, Arnold D (2011) Evaluating the viability of process replication reliability for exascale systems. In: SC \u201911: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, pp 1\u201312. https:\/\/doi.org\/10.1145\/2063384.2063443","DOI":"10.1145\/2063384.2063443"},{"issue":"2","key":"4347_CR13","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1006\/inco.1993.1010","volume":"102","author":"N Graham","year":"1993","unstructured":"Graham N, Harary F, Livingston M, Stout QF (1993) Subcube fault-tolerance in hypercubes. Inf Comput 102(2):280\u2013314. https:\/\/doi.org\/10.1006\/inco.1993.1010","journal-title":"Inf Comput"},{"issue":"3","key":"4347_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3242086","volume":"14","author":"HS Gunawi","year":"2018","unstructured":"Gunawi HS, Suminto RO, Sears R, Golliher C, Sundararaman S, Lin X, Emami T, Sheng W, Bidokhti N, McCaffrey C, Srinivasan D, Panda B, Baptist A, Grider G, Fields PM, Harms K, Ross RB, Jacobson A, Ricci R, Webb K, Alvaro P, Runesha HB, Hao M, Li H (2018) Fail-slow at scale: evidence of hardware performance faults in large production systems. ACM Trans Storage (TOS) 14(3):1\u201326. https:\/\/doi.org\/10.1145\/3242086","journal-title":"ACM Trans Storage (TOS)"},{"key":"4347_CR15","doi-asserted-by":"publisher","unstructured":"Gupta S, Tiwari D, Jantzi C, Rogers JH, Maxwell D (2015) Understanding and exploiting spatial properties of system failures on extreme-scale HPC systems. In: 45th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks, DSN 2015, Rio de Janeiro, Brazil, June 22\u201325, pp 37\u201344. IEEE Computer Society. https:\/\/doi.org\/10.1109\/DSN.2015.52","DOI":"10.1109\/DSN.2015.52"},{"key":"4347_CR16","doi-asserted-by":"publisher","unstructured":"Hurfin M, Most\u00e9faoui A, Raynal M (1998) Consensus in asynchronous systems where processes can crash and recover. In: The Seventeenth Symposium on Reliable Distributed Systems, SRDS 1998, West Lafayette, Indiana, USA, October 20\u201322, Proceedings, pp 280\u2013286. IEEE Computer Society. https:\/\/doi.org\/10.1109\/RELDIS.1998.740510","DOI":"10.1109\/RELDIS.1998.740510"},{"key":"4347_CR17","doi-asserted-by":"publisher","unstructured":"Hursey J, Graham RL (2011) Building a fault tolerant mpi application: a ring communication example. In: 2011 IEEE International Symposium on Parallel and Distributed Processing Workshops and Phd Forum, pp 1549\u20131556. https:\/\/doi.org\/10.1109\/RELDIS.1998.740510","DOI":"10.1109\/RELDIS.1998.740510"},{"key":"4347_CR18","doi-asserted-by":"publisher","unstructured":"Kharbas K, Kim D, Hoefler T, Mueller F (2012) Assessing HPC failure detectors for MPI jobs. In: Stotzka R, Schiffers M, Cotronis Y (eds) Proceedings of the 20th Euromicro International Conference on Parallel, Distributed and Network-Based Processing, PDP 2012, Munich, Germany, February 15\u201317, pp 81\u201388. IEEE. https:\/\/doi.org\/10.1109\/PDP.2012.11","DOI":"10.1109\/PDP.2012.11"},{"issue":"3","key":"4347_CR19","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1145\/357172.357176","volume":"4","author":"L Lamport","year":"1982","unstructured":"Lamport L, Shostak RE, Pease MC (1982) The byzantine generals problem. ACM Trans Program Lang Syst 4(3):382\u2013401. https:\/\/doi.org\/10.1145\/357172.357176","journal-title":"ACM Trans Program Lang Syst"},{"key":"4347_CR20","doi-asserted-by":"publisher","first-page":"467","DOI":"10.1016\/j.future.2020.01.026","volume":"106","author":"N Losada","year":"2020","unstructured":"Losada N, Gonz\u00e1lez P, Mart\u00edn MJ, Bosilca G, Bouteiller A, Teranishi K (2020) Fault tolerance of MPI applications in exascale systems: the ULFM solution. Future Gener Comput Syst 106:467\u2013481. https:\/\/doi.org\/10.1016\/j.future.2020.01.026","journal-title":"Future Gener Comput Syst"},{"issue":"3","key":"4347_CR21","doi-asserted-by":"publisher","first-page":"222","DOI":"10.1145\/357369.357371","volume":"1","author":"RD Schlichting","year":"1983","unstructured":"Schlichting RD, Schneider FB (1983) Fail-stop processors: an approach to designing fault-tolerant computing systems. ACM Trans Comput Syst (TOCS) 1(3):222\u2013238. https:\/\/doi.org\/10.1145\/357369.357371","journal-title":"ACM Trans Comput Syst (TOCS)"},{"key":"4347_CR22","doi-asserted-by":"publisher","unstructured":"Sloan J, Kumar R, Bronevetsky G (2013) An algorithmic approach to error localization and partial recomputation for low-overhead fault tolerance. In: 2013 43rd Annual IEEE\/IFIP International Conference on Dependable Systems and Networks (DSN), Budapest, Hungary, June 24\u201327, pp 1\u201312. IEEE Computer Society. https:\/\/doi.org\/10.1109\/DSN.2013.6575309","DOI":"10.1109\/DSN.2013.6575309"},{"key":"4347_CR23","doi-asserted-by":"publisher","unstructured":"Ye Y, Zhang Y, Ye W (2021) An application-level failure detection algorithm based on a robust and efficient torus-tree for HPC. In: 2021 IEEE Intl Conf on Parallel & Distributed Processing with Applications, Big Data & Cloud Computing, Sustainable Computing & Communications, Social Computing & Networking (ISPA\/BDCloud\/SocialCom\/SustainCom), New York City, NY, USA, September 30\u2013Oct. 3, pp 484\u2013492. IEEE. https:\/\/doi.org\/10.1109\/ISPA-BDCloud-SocialCom-SustainCom52081.2021.00073","DOI":"10.1109\/ISPA-BDCloud-SocialCom-SustainCom52081.2021.00073"},{"issue":"10","key":"4347_CR24","doi-asserted-by":"publisher","first-page":"1261","DOI":"10.1631\/FITEE.1800442","volume":"19","author":"J Zhai","year":"2018","unstructured":"Zhai J, Chen W (2018) A vision of post-exascale programming. Front Inf Technol Electron Eng 19(10):1261\u20131266. https:\/\/doi.org\/10.1631\/FITEE.1800442","journal-title":"Front Inf Technol Electron Eng"},{"key":"4347_CR25","doi-asserted-by":"publisher","unstructured":"Zhong D, Bouteiller A, Luo X, Bosilca G (2019) Runtime level failure detection and propagation in HPC systems. In: Hoefler T, Tr\u00e4ff JL (eds) Proceedings of the 26th European MPI Users\u2019 Group Meeting, EuroMPI 2019, Z\u00fcrich, Switzerland, September 11\u201313, pp 14:1\u201314:11. ACM. https:\/\/doi.org\/10.1145\/3343211.3343225","DOI":"10.1145\/3343211.3343225"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-022-04347-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-022-04347-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-022-04347-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T16:37:22Z","timestamp":1658162242000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-022-04347-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,27]]},"references-count":25,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2022,8]]}},"alternative-id":["4347"],"URL":"https:\/\/doi.org\/10.1007\/s11227-022-04347-0","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2022,3,27]]},"assertion":[{"value":"30 January 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}