{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T00:48:44Z","timestamp":1742950124754,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783031061554"},{"type":"electronic","value":"9783031061561"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-06156-1_24","type":"book-chapter","created":{"date-parts":[[2022,6,8]],"date-time":"2022-06-08T20:29:39Z","timestamp":1654720179000},"page":"298-309","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Exploring the Impact of Node Failures on the Resource Allocation for Parallel Jobs"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5461-556X","authenticated-orcid":false,"given":"Ioannis","family":"Vardas","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2173-062X","authenticated-orcid":false,"given":"Manolis","family":"Ploumidis","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4768-3289","authenticated-orcid":false,"given":"Manolis","family":"Marazakis","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,6,9]]},"reference":[{"key":"24_CR1","unstructured":"Cappello, F., Al, G., Gropp, W., Kale, S., Kramer, B., Snir, M.: Toward exascale resilience: 2014 update. Supercomput. Front. Innov.: Int. J. 1(1), 5\u201328 (2014)"},{"issue":"10","key":"24_CR2","doi-asserted-by":"publisher","first-page":"2899","DOI":"10.1016\/j.jpdc.2014.06.008","volume":"74","author":"H Casanova","year":"2014","unstructured":"Casanova, H., Giersch, A., Legrand, A., Quinson, M., Suter, F.: Versatile, scalable, and accurate simulation of distributed applications and platforms. J. Parallel Distrib. Comput. 74(10), 2899\u20132917 (2014)","journal-title":"J. Parallel Distrib. Comput."},{"issue":"3","key":"24_CR3","doi-asserted-by":"publisher","first-page":"308","DOI":"10.1109\/71.993209","volume":"13","author":"A Dogan","year":"2002","unstructured":"Dogan, A., Ozguner, F.: Matching and scheduling algorithms for minimizing execution time and failure probability of applications in heterogeneous computing. IEEE Trans. Parallel Distrib. Syst. 13(3), 308\u2013323 (2002)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"24_CR4","doi-asserted-by":"crossref","unstructured":"El-Sayed, N., Zhu, H., Schroeder, B.: Learning from failure across multiple clusters: a trace-driven approach to understanding, predicting, and mitigating job terminations. In: 2017 IEEE 37th International Conference on Distributed Computing Systems (ICDCS), pp. 1333\u20131344 (2017)","DOI":"10.1109\/ICDCS.2017.317"},{"key":"24_CR5","unstructured":"Elnozahy, E.N.: System resilience at extreme scale. Technical report. Defense Advanced Research Project Agency (2008)"},{"key":"24_CR6","doi-asserted-by":"crossref","unstructured":"Engelmann, C., Lauer, F.: Facilitating co-design for extreme-scale systems through lightweight simulation. In: 2010 IEEE International Conference on Cluster Computing Workshops and Posters (CLUSTER WORKSHOPS), pp. 1\u20138 (2010)","DOI":"10.1109\/CLUSTERWKSP.2010.5613113"},{"key":"24_CR7","doi-asserted-by":"crossref","unstructured":"Engelmann, C., Naughton, T.: Toward a performance\/resilience tool for hardware\/software co-design of high-performance computing systems. In: 2013 42nd International Conference on Parallel Processing, pp. 960\u2013969 (2013)","DOI":"10.1109\/ICPP.2013.114"},{"key":"24_CR8","unstructured":"ETP4HPC-SRA 4: Strategic Research Agenda for High Performance Computing in Europe. https:\/\/www.etp4hpc.eu\/pujades\/files\/ETP4HPC_SRA4_2020_web.pdf"},{"issue":"4","key":"24_CR9","doi-asserted-by":"publisher","first-page":"384","DOI":"10.1016\/j.jpdc.2010.01.002","volume":"70","author":"S Fu","year":"2010","unstructured":"Fu, S.: Failure-aware resource management for high-availability computing clusters with distributed virtual machines. J. Parallel Distrib. Comput. 70(4), 384\u2013393 (2010)","journal-title":"J. Parallel Distrib. Comput."},{"key":"24_CR10","doi-asserted-by":"crossref","unstructured":"Gottumukkala, N.R., Leangsuksun, C.B., Taerat, N., Nassar, R., Scott, S.L.: Reliability-aware resource allocation in HPC systems. In: 2007 IEEE International Conference on Cluster Computing, pp. 312\u2013321 (2007)","DOI":"10.1109\/CLUSTR.2007.4629245"},{"issue":"1","key":"24_CR11","doi-asserted-by":"publisher","first-page":"162","DOI":"10.1109\/TR.2009.2034291","volume":"59","author":"NR Gottumukkala","year":"2010","unstructured":"Gottumukkala, N.R., Nassar, R., Paun, M., Leangsuksun, C.B., Scott, S.L.: Reliability of a system of k nodes for high performance computing applications. IEEE Trans. Reliab. 59(1), 162\u2013169 (2010)","journal-title":"IEEE Trans. Reliab."},{"key":"24_CR12","doi-asserted-by":"crossref","unstructured":"Hakem, M., Butelle, F.: Reliability and scheduling on systems subject to failures. In: 2007 International Conference on Parallel Processing (ICPP 2007), pp. 38\u201338 (2007)","DOI":"10.1109\/ICPP.2007.72"},{"key":"24_CR13","doi-asserted-by":"crossref","unstructured":"Heien, E., LaPine, D., Kondo, D., Kramer, B., Gainaru, A., Cappello, F.: Modeling and tolerating heterogeneous failures in large parallel systems. In: SC 2011: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201311 (2011)","DOI":"10.1145\/2063384.2063444"},{"issue":"5","key":"24_CR14","doi-asserted-by":"publisher","first-page":"475","DOI":"10.1109\/71.852400","volume":"11","author":"H Choo","year":"2000","unstructured":"Choo, H., Yoo, S.-M., Youn, H.Y.: Processor scheduling and allocation for 3D torus multicomputer systems. IEEE Trans. Parallel Distrib. Syst. 11(5), 475\u2013484 (2000)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"24_CR15","doi-asserted-by":"crossref","unstructured":"Jauk, D., Yang, D., Schulz, M.: Predicting faults in high performance computing systems: an in-depth survey of the state-of-the-practice. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2019, pp. 30:1\u201330:13. ACM, New York (2019)","DOI":"10.1145\/3295500.3356185"},{"key":"24_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1007\/978-3-319-10214-6_5","volume-title":"High Performance Computing Systems. Performance Modeling, Benchmarking and Simulation","author":"S Levy","year":"2014","unstructured":"Levy, S., Topp, B., Ferreira, K.B., Arnold, D., Hoefler, T., Widener, P.: Using simulation to evaluate the performance of resilience strategies at scale. In: Jarvis, S.A., Wright, S.A., Hammond, S.D. (eds.) PMBS 2013. LNCS, vol. 8551, pp. 91\u2013114. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10214-6_5"},{"key":"24_CR17","doi-asserted-by":"crossref","unstructured":"Machida, F., Kawato, M., Maeno, Y.: Redundant virtual machine placement for fault-tolerant consolidated server clusters. In: 2010 IEEE Network Operations and Management Symposium - NOMS 2010, pp. 32\u201339 (2010)","DOI":"10.1109\/NOMS.2010.5488431"},{"key":"24_CR18","doi-asserted-by":"crossref","unstructured":"Martino, C.D., Kramer, W., Kalbarczyk, Z., Iyer, R.: Measuring and understanding extreme-scale application resilience: a field study of 5,000,000 HPC application runs. In: 2015 45th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks, pp. 25\u201336 (2015)","DOI":"10.1109\/DSN.2015.50"},{"key":"24_CR19","doi-asserted-by":"crossref","unstructured":"Oliner, A.J., Sahoo, R.K., Moreira, J.E., Gupta, M., Sivasubramaniam, A.: Fault-aware job scheduling for BlueGene\/L systems. In: Proceedings of the 18th International Parallel and Distributed Processing Symposium, p. 64 (2004)","DOI":"10.1109\/IPDPS.2004.1302991"},{"key":"24_CR20","doi-asserted-by":"crossref","unstructured":"Schroeder, B., Gibson, G.: Understanding failures in Petascale computers. J. Phys.: Conf. Ser. 78 (2007)","DOI":"10.1088\/1742-6596\/78\/1\/012022"},{"issue":"4","key":"24_CR21","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1109\/TDSC.2009.4","volume":"7","author":"B Schroeder","year":"2010","unstructured":"Schroeder, B., Gibson, G.A.: A large-scale study of failures in high-performance computing systems. IEEE Trans. Depend. Secure Comput. 7(4), 337\u2013350 (2010)","journal-title":"IEEE Trans. Depend. Secure Comput."},{"key":"24_CR22","unstructured":"Slurm Resource Selection Plugin. https:\/\/slurm.schedmd.com\/selectplugins.html"},{"key":"24_CR23","unstructured":"Snir, M., et al.: Addressing failures in exascale computing. In: ICiS Workshop ANL\/MCS-TM-332, April 2013"},{"key":"24_CR24","doi-asserted-by":"crossref","unstructured":"Tikotekar, A., Vallee, G., Naughton, T., Scott, S.L., Leangsuksun, C.: Evaluation of fault-tolerant policies using simulation. In: 2007 IEEE International Conference on Cluster Computing, pp. 303\u2013311 (2007)","DOI":"10.1109\/CLUSTR.2007.4629244"},{"key":"24_CR25","doi-asserted-by":"crossref","unstructured":"Tiwari, D., Gupta, S., Vazhkudai, S.S.: Lazy checkpointing: exploiting temporal locality in failures to mitigate checkpointing overheads on extreme-scale systems. In: 2014 44th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks, pp. 25\u201336 (2014)","DOI":"10.1109\/DSN.2014.101"},{"key":"24_CR26","doi-asserted-by":"crossref","unstructured":"Vardas, I., Ploumidis, M., Marazakis, M.: Towards communication profile, topology and node failure aware process placement. In: 2020 IEEE 32nd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD), pp. 241\u2013248 (2020)","DOI":"10.1109\/SBAC-PAD49847.2020.00041"},{"key":"24_CR27","doi-asserted-by":"crossref","unstructured":"Li, Y., Lan, Z.: Exploit failure prediction for adaptive fault-tolerance in cluster computing. In: Sixth IEEE International Symposium on Cluster Computing and the Grid (CCGRID 2006), vol. 1, pp. 8 p. 538 (2006)","DOI":"10.1109\/CCGRID.2006.45"},{"key":"24_CR28","doi-asserted-by":"crossref","unstructured":"Yigitbasi, N., Gallet, M., Kondo, D., Iosup, A., Epema, D.: Analysis and modeling of time-correlated failures in large-scale distributed systems. In: 2010 11th IEEE\/ACM International Conference on Grid Computing, pp. 65\u201372 (2010)","DOI":"10.1109\/GRID.2010.5697961"},{"key":"24_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1007\/10968987_3","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"AB Yoo","year":"2003","unstructured":"Yoo, A.B., Jette, M.A., Grondona, M.: SLURM: simple Linux utility for resource management. In: Feitelson, D., Rudolph, L., Schwiegelshohn, U. (eds.) JSSPP 2003. LNCS, vol. 2862, pp. 44\u201360. Springer, Heidelberg (2003). https:\/\/doi.org\/10.1007\/10968987_3"},{"key":"24_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"233","DOI":"10.1007\/11407522_13","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"Y Zhang","year":"2005","unstructured":"Zhang, Y., Squillante, M.S., Sivasubramaniam, A., Sahoo, R.K.: Performance implications of failures in large-scale cluster scheduling. In: Feitelson, D.G., Rudolph, L., Schwiegelshohn, U. (eds.) JSSPP 2004. LNCS, vol. 3277, pp. 233\u2013252. Springer, Heidelberg (2005). https:\/\/doi.org\/10.1007\/11407522_13"}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2021: Parallel Processing Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-06156-1_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,26]],"date-time":"2024-09-26T17:42:16Z","timestamp":1727372536000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-06156-1_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031061554","9783031061561"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-06156-1_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"9 June 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Euro-Par","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Parallel Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lisbon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Portugal","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 August 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"europar2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2021.euro-par.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"136","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"39","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"29% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"6","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}