{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T15:33:47Z","timestamp":1763480027528,"version":"3.28.0"},"reference-count":19,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2014,9]]},"DOI":"10.1109\/cluster.2014.6968777","type":"proceedings-article","created":{"date-parts":[[2014,12,2]],"date-time":"2014-12-02T22:40:03Z","timestamp":1417560003000},"page":"84-92","source":"Crossref","is-referenced-by-count":7,"title":["Checkpoint\/restart in practice: When &amp;#x2018;simple is better&amp;#x2019;"],"prefix":"10.1109","author":[{"given":"Nosayba","family":"El-Sayed","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bianca","family":"Schroeder","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"19","doi-asserted-by":"publisher","DOI":"10.1145\/361147.361115"},{"key":"17","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/78\/1\/012022"},{"key":"18","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS.2010.48"},{"key":"15","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2008.109"},{"key":"16","article-title":"A large-scale study of failures in highperformance computing systems","author":"schroeder","year":"0","journal-title":"Proc of DSN'06"},{"key":"13","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTR.2007.4629264"},{"key":"14","first-page":"2690","article-title":"Analysis of dependencies of checkpoint cost and checkpoint interval of fault tolerant mpi applications","volume":"2","author":"shastry","year":"2010","journal-title":"Intl Journal on Computer Science and Engineering (IJCSE)"},{"key":"11","article-title":"On the viability of checkpoint compression for extreme scale fault tolerance","author":"ibtesham","year":"0","journal-title":"Proc of Euro-Par'11"},{"key":"12","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2010.80"},{"key":"3","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2005.14"},{"year":"0","key":"2"},{"year":"0","key":"1"},{"key":"10","article-title":"Modeling and tolerating heterogeneous failures in large parallel systems","author":"heien","year":"0","journal-title":"Proc of SC'11"},{"key":"7","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2004.11.016"},{"key":"6","article-title":"A flexible checkpoint\/restart model in distributed systems","author":"bouguerra","year":"0","journal-title":"Proc of PPAM'09"},{"key":"5","article-title":"Checkpointing strategies for parallel jobs","author":"bougeret","year":"0","journal-title":"Proc of SC'11"},{"key":"4","doi-asserted-by":"publisher","DOI":"10.1145\/1654059.1654081"},{"key":"9","article-title":"Checkpoint\/restart in practice: When 'simple is better'","author":"el-sayed","year":"2014","journal-title":"Technical Report TECHNICAL REPORT CSRG-622"},{"key":"8","article-title":"Reading between the lines of failure logs: Understanding how hpc systems fail","author":"el-sayed","year":"0","journal-title":"Proceedings of DSN '11"}],"event":{"name":"2014 IEEE International Conference On Cluster Computing (CLUSTER)","start":{"date-parts":[[2014,9,22]]},"location":"Madrid, Spain","end":{"date-parts":[[2014,9,26]]}},"container-title":["2014 IEEE International Conference on Cluster Computing (CLUSTER)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6957006\/6968662\/06968777.pdf?arnumber=6968777","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,10,14]],"date-time":"2020-10-14T15:11:50Z","timestamp":1602688310000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/6968777"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,9]]},"references-count":19,"URL":"https:\/\/doi.org\/10.1109\/cluster.2014.6968777","relation":{},"subject":[],"published":{"date-parts":[[2014,9]]}}}