{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T11:38:32Z","timestamp":1730288312167,"version":"3.28.0"},"reference-count":36,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012,2]]},"DOI":"10.1109\/pdp.2012.11","type":"proceedings-article","created":{"date-parts":[[2012,3,26]],"date-time":"2012-03-26T21:47:31Z","timestamp":1332798451000},"page":"81-88","source":"Crossref","is-referenced-by-count":9,"title":["Assessing HPC Failure Detectors for MPI Jobs"],"prefix":"10.1109","author":[{"given":"Kishor","family":"Kharbas","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Donghoon","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Torsten","family":"Hoefler","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Frank","family":"Mueller","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"19","DOI":"10.1002\/(SICI)1097-024X(199709)27:9<1103::AID-SPE130>3.0.CO;2-2"},{"key":"35","article-title":"Proactive process-level live migration in hpc environments","author":"wang","year":"2008","journal-title":"Supercomputing"},{"doi-asserted-by":"publisher","key":"17","DOI":"10.1109\/SC.2005.76"},{"doi-asserted-by":"publisher","key":"36","DOI":"10.1145\/1141277.1141620"},{"doi-asserted-by":"publisher","key":"18","DOI":"10.1145\/1066677.1067026"},{"doi-asserted-by":"publisher","key":"33","DOI":"10.1145\/1183401.1183433"},{"key":"15","first-page":"346","article-title":"FT-MPI: Fault Tolerant MPI, supporting dynamic applications in a dynamic world","volume":"1908","author":"fagg","year":"2000","journal-title":"Euro PVM\/MPI Meeting"},{"doi-asserted-by":"publisher","key":"34","DOI":"10.1109\/IPDPS.2007.370307"},{"doi-asserted-by":"publisher","key":"16","DOI":"10.1109\/IPDPS.2009.5160969"},{"year":"2000","author":"duell","journal-title":"The Design and Implementation of Berkeley Lab's Linux Checkpoint\/Restart","key":"13"},{"doi-asserted-by":"publisher","key":"14","DOI":"10.1109\/12.142678"},{"doi-asserted-by":"publisher","key":"11","DOI":"10.1145\/226643.226647"},{"doi-asserted-by":"publisher","key":"12","DOI":"10.1002\/spe.4380210802"},{"doi-asserted-by":"publisher","key":"21","DOI":"10.1109\/IPDPS.2007.370603"},{"doi-asserted-by":"publisher","key":"20","DOI":"10.1109\/IPDPS.2007.370605"},{"key":"22","article-title":"Incremental checkpointing for grids","author":"mehnert-spahn","year":"2009","journal-title":"Linux Symposium"},{"doi-asserted-by":"publisher","key":"23","DOI":"10.1145\/367701.367728"},{"doi-asserted-by":"publisher","key":"24","DOI":"10.1109\/SC.2010.18"},{"key":"25","article-title":"Software failures and the road to a petaflop machine","author":"philp","year":"2005","journal-title":"HPCRI 1st Workshop on High Performance Computing Reliability Issues"},{"doi-asserted-by":"publisher","key":"26","DOI":"10.1145\/773379.806619"},{"doi-asserted-by":"publisher","key":"27","DOI":"10.1109\/IPDPS.2007.370309"},{"doi-asserted-by":"publisher","key":"28","DOI":"10.1007\/978-3-642-15277-1_8"},{"key":"29","article-title":"The LAM\/MPI checkpoint\/restart framework: Systeminitiated checkpointing","author":"sankaran","year":"2003","journal-title":"Proceedings LACSI Symposium"},{"doi-asserted-by":"publisher","key":"3","DOI":"10.1109\/HPDC.1999.805295"},{"doi-asserted-by":"publisher","key":"2","DOI":"10.1145\/1006209.1006248"},{"doi-asserted-by":"publisher","key":"10","DOI":"10.1109\/IPDPS.2007.370310"},{"year":"0","key":"1"},{"doi-asserted-by":"publisher","key":"30","DOI":"10.1109\/IPDPS.2009.5161050"},{"doi-asserted-by":"publisher","key":"7","DOI":"10.1145\/1048935.1050176"},{"key":"6","article-title":"MPICHV: Toward a scalable fault tolerant MPI for volatile nodes","author":"bosilca","year":"2002","journal-title":"Supercomputing"},{"doi-asserted-by":"publisher","key":"32","DOI":"10.1109\/IPPS.1996.508106"},{"key":"5","article-title":"Analysis of the component architecture overhead in OpenMPI","author":"barrett","year":"2005","journal-title":"EuropeanPVM\/MPI Users' Group Meeting"},{"doi-asserted-by":"publisher","key":"31","DOI":"10.1109\/DSN.2006.5"},{"doi-asserted-by":"publisher","key":"4","DOI":"10.1177\/109434209100500306"},{"key":"9","article-title":"Proactive fault tolerance in MPI applications via task migration","author":"chakravorty","year":"2006","journal-title":"International Conference on High Performance Computing"},{"key":"8","article-title":"Proactive fault tolerance in large systems","author":"chakravorty","year":"2005","journal-title":"HPCRI 1st Workshop on High Performance Computing Reliability Issues in Proceedings of the 11th International Symposium on High Performance Computer Architecture (HPCA-11)"}],"event":{"name":"2012 20th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)","start":{"date-parts":[[2012,2,15]]},"location":"Munich, Germany","end":{"date-parts":[[2012,2,17]]}},"container-title":["2012 20th Euromicro International Conference on Parallel, Distributed and Network-based Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx5\/6168524\/6169521\/06169533.pdf?arnumber=6169533","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,3,21]],"date-time":"2017-03-21T17:03:42Z","timestamp":1490115822000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/6169533\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,2]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/pdp.2012.11","relation":{},"subject":[],"published":{"date-parts":[[2012,2]]}}}