{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,6]],"date-time":"2024-09-06T08:49:21Z","timestamp":1725612561996},"reference-count":32,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1109\/igcc.2016.7892626","type":"proceedings-article","created":{"date-parts":[[2017,4,7]],"date-time":"2017-04-07T03:56:23Z","timestamp":1491537383000},"page":"1-8","source":"Crossref","is-referenced-by-count":2,"title":["Monitoring strategies for scalable dynamic checkpointing"],"prefix":"10.1109","author":[{"given":"Swann","family":"Perarnau","sequence":"first","affiliation":[]},{"given":"Leonardo","family":"Bautista-Gomez","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1109\/IPDPS.2011.83"},{"key":"ref31","article-title":"Modeling event-driven time series with generalized hidden semi-markov models","author":"salfner","year":"2006","journal-title":"Technical Report 208 Department of Computer Science Humboldt University"},{"key":"ref30","article-title":"CIFTS: A coordinated infrastructure for fault-tolerant systems","author":"gupta","year":"2009","journal-title":"TICP"},{"year":"0","author":"beckman","article-title":"Argo: An exascale operating system and runtime research project","key":"ref10"},{"year":"0","journal-title":"Intel Corporation","article-title":"Intel 64 and IA-32 architectures software developer's manual","key":"ref11"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1109\/ARES.2009.105"},{"key":"ref13","article-title":"Path-based failure and evolution management","author":"chen","year":"2004","journal-title":"NSDI"},{"key":"ref14","article-title":"An analysis of clustered failures on large supercomputing 
systems","author":"hacker","year":"2009","journal-title":"JPDC"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1145\/2063384.2063427"},{"year":"2015","author":"bautista-gomez","article-title":"Dynamic version of FTI","key":"ref16"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/INM.2007.374780"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1147\/rd.452.0311"},{"key":"ref19","article-title":"Toward efficient failure detection and recovery in hpc","author":"rani","year":"2006","journal-title":"High Availability and Performance Workshop"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1145\/1081870.1081927"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1016\/j.future.2004.11.016"},{"key":"ref27","article-title":"Quantifying temporal and spatial fault event correlation for proactive failure management","author":"fu","year":"2007","journal-title":"SRDS"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1145\/361147.361115"},{"key":"ref6","article-title":"Failure data analysis of hpc systems","author":"lu","year":"2013","journal-title":"Technical Report CoRR abs\/1302 4779"},{"year":"2011","author":"becklehimer","article-title":"Real time health monitoring of the cray xt3\/xt4 using the simple event correlator (sec)","key":"ref29"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1109\/DSN.2014.101"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.1145\/2063384.2063444"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/IPDPS.2016.100"},{"key":"ref2","article-title":"A large-scale study of failures in high-performance computing 
systems","author":"schroeder","year":"2010","journal-title":"TDSC"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.1109\/IPDPS.2013.74"},{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.1109\/MM.2005.110"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1088\/1742-6596\/78\/1\/012022"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1109\/HPCA.2015.7056044"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/HPCA.2016.7446091"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.1109\/DSN.2014.62"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.1145\/2807591.2807666"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.1109\/DSN.2015.52"},{"key":"ref25","doi-asserted-by":"crossref","DOI":"10.1145\/2189750.2150989","article-title":"Cosmic rays don't strike twice: understanding the nature of dram errors and the implications for system design","author":"hwang","year":"2012","journal-title":"SIGARCH Comput Archit News"}],"event":{"name":"2016 Seventh International Green and Sustainable Computing Conference (IGSC)","start":{"date-parts":[[2016,11,7]]},"location":"Hangzhou, China","end":{"date-parts":[[2016,11,9]]}},"container-title":["2016 Seventh International Green and Sustainable Computing Conference (IGSC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7888616\/7892581\/07892626.pdf?arnumber=7892626","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,20]],"date-time":"2019-09-20T19:09:20Z","timestamp":1569006560000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7892626\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/igcc.2016.7892626","relation":{},"subject":[],"published":{"date-parts":[[2016]]}}}