{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T13:27:51Z","timestamp":1730208471690,"version":"3.28.0"},"reference-count":21,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2007]]},"DOI":"10.1109\/clustr.2007.4629264","type":"proceedings-article","created":{"date-parts":[[2008,9,24]],"date-time":"2008-09-24T14:24:42Z","timestamp":1222266282000},"page":"452-457","source":"Crossref","is-referenced-by-count":21,"title":["A reliability-aware approach for an optimal checkpoint\/restart model in HPC environments"],"prefix":"10.1109","author":[{"given":"Yudan","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Raja","family":"Nassar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chockchai","family":"Leangsuksun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nichamon","family":"Naksinehaboon","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mihaela","family":"Paun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stephen","family":"Scott","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/FTCS.1999.781059"},{"article-title":"Stochastic Processes","year":"1995","author":"ross","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2005.337"},{"key":"ref13","article-title":"The LAM\/MPI Checkpoint\/Restart Framework: System-Initiated Checkpoint","author":"sankaran","year":"2003","journal-title":"The 2003 Los Alamos Computer Science Institute Symposium"},{"key":"ref14","article-title":"On the survivability of standard MPI applications","author":"tikotekar","year":"2006","journal-title":"Proceedings of 7th LCI International Conference on Linux Clusters The HPC Revolution"},{"key":"ref15","article-title":"A survey of fault-tolerance and fault-recovery techniques in parallel systems","author":"treaster","year":"2005","journal-title":"Technical Report cs DC\/ 0501002 ACM Computing Research Repository (CoRR)"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/HPDC.1993.263838"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/361147.361115"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2002.10017"},{"key":"ref19","first-page":"249","article-title":"A large-scale study of failures in highperformance computing systems","author":"schroeder","year":"2006","journal-title":"Proceedings of International Symposium on Dependable Systems and Networks (DSN)"},{"key":"ref4","article-title":"A Higher Order Estimate of the Optimum Checkpoint Interval for Restart Dumps","author":"daly","year":"2004","journal-title":"Future Generation Computer Systems"},{"key":"ref3","first-page":"3","article-title":"A Model for Predicting the Optimum Checkpoint Interval for Restart Dumps","volume":"4","author":"daly","year":"2003","journal-title":"ICCS 2003 LNCS 2660 Proceedings"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/24.9847"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2004.15"},{"key":"ref8","first-page":"167","article-title":"Checkpointing and the Modeling of Program Execution Time","author":"nicola","year":"1995","journal-title":"Software Fault Tolerance"},{"article-title":"Reliability-Aware Optimal Checkpoint\/Restart Model In High Performance Computing","year":"2007","author":"liu","key":"ref7"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.1975.6312824"},{"key":"ref1","first-page":"40","article-title":"A survey of analytic models of roll-back and recovery strategies","volume":"5","author":"chandy","year":"1975","journal-title":"Computer 8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/1183401.1183406"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/12.936236"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2006.22"}],"event":{"name":"2007 IEEE International Conference on Cluster Computing (CLUSTER)","start":{"date-parts":[[2007,9,17]]},"location":"Austin, TX, USA","end":{"date-parts":[[2007,9,20]]}},"container-title":["2007 IEEE International Conference on Cluster Computing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx5\/4623687\/4629185\/04629264.pdf?arnumber=4629264","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,3,17]],"date-time":"2017-03-17T11:21:53Z","timestamp":1489749713000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/4629264\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2007]]},"references-count":21,"URL":"https:\/\/doi.org\/10.1109\/clustr.2007.4629264","relation":{},"subject":[],"published":{"date-parts":[[2007]]}}}