{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T05:49:25Z","timestamp":1740116965586,"version":"3.37.3"},"reference-count":40,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61221491","61303071","61303068","61120106005"],"award-info":[{"award-number":["61221491","61303071","61303068","61120106005"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangzhou Science and Information Technology Bureau","award":["134200026"],"award-info":[{"award-number":["134200026"]}]},{"name":"State Key Laboratory of High Performance Computing","award":["201303-01"],"award-info":[{"award-number":["201303-01"]}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Parallel Computing"],"published-print":{"date-parts":[[2015,1]]},"DOI":"10.1016\/j.parco.2014.11.003","type":"journal-article","created":{"date-parts":[[2014,11,25]],"date-time":"2014-11-25T20:50:34Z","timestamp":1416948634000},"page":"50-65","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":8,"special_numbering":"C","title":["GS-DMR: Low-overhead soft error detection scheme for stencil-based computation"],"prefix":"10.1016","volume":"41","author":[{"given":"Ren","family":"Xiaoguang","sequence":"first","affiliation":[]},{"given":"Xu","family":"Xinhai","sequence":"additional","affiliation":[]},{"given":"Wang","family":"Qian","sequence":"additional","affiliation":[]},{"given":"Chen","family":"Juan","sequence":"additional","affiliation":[]},{"given":"Wang","family":"Miao","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Xuejun","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"unstructured":"<http:\/\/www.nccs.gov\/jaguar\/>.","key":"10.1016\/j.parco.2014.11.003_b0005"},{"key":"10.1016\/j.parco.2014.11.003_b0010","series-title":"IEEE International Symposium on Parallel and Distributed Processing, IPDPS 2008","first-page":"1","article-title":"Efficient software checking for fault tolerance","author":"Yu","year":"2008"},{"key":"10.1016\/j.parco.2014.11.003_b0015","series-title":"Proceedings of the 22nd Annual International Conference on Supercomputing","first-page":"155","article-title":"Soft error vulnerability of iterative linear algebra methods","author":"Bronevetsky","year":"2008"},{"unstructured":"K. Asanovic, R. Bodik, B.C. Catanzaro, J.J. Gebis, P. Husbands, K. Keutzer, D.A. Patterson, W.L. Plishker, J. Shalf, S.W. Williams, et al., The landscape of parallel computing research: a view from berkeley, Tech. rep., Technical Report UCB\/EECS-2006-183, EECS Department, University of California, Berkeley, 2006.","key":"10.1016\/j.parco.2014.11.003_b0020"},{"key":"10.1016\/j.parco.2014.11.003_b0025","series-title":"Euro-Par 2009 Parallel Processing","first-page":"772","article-title":"Optimized stencil computation using in-place calculation on modern multicore systems","author":"Augustin","year":"2009"},{"volume":"vol. 1725","year":"2000","author":"Wolf-Gladrow","key":"10.1016\/j.parco.2014.11.003_b0030"},{"key":"10.1016\/j.parco.2014.11.003_b0035","series-title":"Proceedings of the International Conference on Supercomputing","first-page":"152","article-title":"Characterizing the impact of soft errors on iterative methods in scientific computing","author":"Shantharam","year":"2011"},{"issue":"3","key":"10.1016\/j.parco.2014.11.003_b0040","doi-asserted-by":"crossref","first-page":"303","DOI":"10.1016\/j.future.2004.11.016","article-title":"A higher order estimate of the optimum checkpoint interval for restart dumps","volume":"22","author":"Daly","year":"2006","journal-title":"Future Gen. Comput. Syst."},{"unstructured":"<http:\/\/www.top500.org\/>.","key":"10.1016\/j.parco.2014.11.003_b0045"},{"issue":"2","key":"10.1016\/j.parco.2014.11.003_b0050","doi-asserted-by":"crossref","first-page":"329","DOI":"10.1006\/jcph.1995.1103","article-title":"Simulation of cavity flow by the lattice boltzmann method","volume":"118","author":"Hou","year":"1995","journal-title":"J. Comput. Phys."},{"issue":"3","key":"10.1016\/j.parco.2014.11.003_b0055","doi-asserted-by":"crossref","first-page":"2780","DOI":"10.1103\/PhysRevE.55.2780","article-title":"Simulation of Rayleigh\u2013B\u00e9nard convection using a lattice boltzmann method","volume":"55","author":"Shan","year":"1997","journal-title":"Phys. Rev. E"},{"issue":"1","key":"10.1016\/j.parco.2014.11.003_b0060","doi-asserted-by":"crossref","first-page":"273","DOI":"10.1017\/S0022112004001272","article-title":"Numerical study of flow past an impulsively started cylinder by the lattice-boltzmann method","volume":"519","author":"Li","year":"2004","journal-title":"J. Fluid Mech."},{"key":"10.1016\/j.parco.2014.11.003_b0065","series-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","first-page":"57","article-title":"Classifying soft error vulnerabilities in extreme-scale scientific applications using a binary instrumentation tool","author":"Li","year":"2012"},{"unstructured":"V. Sridharan, D.A. Liberty, D.R. Kaeli, A taxonomy to enable error recovery and correction in software, in: Workshop on Quality-Aware Design, 2008.","key":"10.1016\/j.parco.2014.11.003_b0070"},{"unstructured":"J. Elliott, F. Mueller, M. Stoyanov, C. Webster, Quantifying the impact of single bit flips on floating point arithmetic.","key":"10.1016\/j.parco.2014.11.003_b0075"},{"key":"10.1016\/j.parco.2014.11.003_b0080","series-title":"Proceedings of the 2011 International Conference for High Performance Computing, Networking, Storage and Analysis (SC)","article-title":"Fault-tolerant iterative methods via selective reliability","author":"Hoemmen","year":"2011"},{"issue":"6","key":"10.1016\/j.parco.2014.11.003_b0085","doi-asserted-by":"crossref","first-page":"518","DOI":"10.1109\/TC.1984.1676475","article-title":"Algorithm-based fault tolerance for matrix operations","volume":"100","author":"Huang","year":"1984","journal-title":"IEEE Trans. Comput."},{"issue":"9","key":"10.1016\/j.parco.2014.11.003_b0090","doi-asserted-by":"crossref","first-page":"1089","DOI":"10.1109\/TC.2003.1228507","article-title":"An algorithm-based error detection scheme for the multigrid method","volume":"52","author":"Mishra","year":"2003","journal-title":"IEEE Trans. Comput."},{"key":"10.1016\/j.parco.2014.11.003_b0095","series-title":"Proceedings of the 23rd International Symposium on High-performance Parallel and Distributed Computing, HPDC \u201914","first-page":"49","article-title":"Ft-scalapack: correcting soft errors on-line for scalapack cholesky, qr, and lu factorization routines","author":"Wu","year":"2014"},{"key":"10.1016\/j.parco.2014.11.003_b0100","series-title":"Proceedings of the 22Nd International Symposium on High-performance Parallel and Distributed Computing, HPDC \u201913","first-page":"167","article-title":"Correcting soft errors online in lu factorization","author":"Davies","year":"2013"},{"doi-asserted-by":"crossref","unstructured":"D. Hakkarinen, P. Wu, Z. Chen, Fail-stop failure algorithm-based fault tolerance for cholesky decomposition, 2014. http:\/\/dx.doi.org\/10.1109\/TPDS.2014.2320502.","key":"10.1016\/j.parco.2014.11.003_b0105","DOI":"10.1109\/TPDS.2014.2320502"},{"key":"10.1016\/j.parco.2014.11.003_b0110","series-title":"Proceedings of the 18th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP \u201913","first-page":"167","article-title":"Online-abft: an online algorithm based fault tolerance scheme for soft error detection in iterative methods","author":"Chen","year":"2013"},{"key":"10.1016\/j.parco.2014.11.003_b0115","series-title":"Proceedings of the 20th International Symposium on High Performance Distributed Computing","first-page":"73","article-title":"Algorithm-based recovery for iterative methods without checkpointing","author":"Chen","year":"2011"},{"key":"10.1016\/j.parco.2014.11.003_b0120","series-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, SC \u201913","first-page":"44:1","article-title":"Rethinking algorithm-based fault tolerance with a cooperative software-hardware approach","author":"Li","year":"2013"},{"issue":"6","key":"10.1016\/j.parco.2014.11.003_b0125","doi-asserted-by":"crossref","first-page":"457","DOI":"10.1016\/j.jocs.2013.01.004","article-title":"Soft error resilient qr factorization for hybrid system with gpgpu","volume":"4","author":"Du","year":"2013","journal-title":"J. Comput. Sci."},{"doi-asserted-by":"crossref","unstructured":"J. Tan, N. Goswami, T. Li, X. Fu, Analyzing soft-error vulnerability on gpgpu microarchitecture, in: 2011 IEEE International Symposium on Workload Characterization (IISWC), 2011, pp. 226\u2013235. http:\/\/dx.doi.org\/10.1109\/IISWC.2011.6114182.","key":"10.1016\/j.parco.2014.11.003_b0130","DOI":"10.1109\/IISWC.2011.6114182"},{"doi-asserted-by":"crossref","unstructured":"B. Mills, T. Znati, R. Melhem, Shadow computing: an energy-aware fault tolerant computing model, in: 2014 International Conference on Computing, Networking and Communications (ICNC), 2014, pp. 73\u201377. http:\/\/dx.doi.org\/10.1109\/ICCNC.2014.6785308.","key":"10.1016\/j.parco.2014.11.003_b0135","DOI":"10.1109\/ICCNC.2014.6785308"},{"issue":"9","key":"10.1016\/j.parco.2014.11.003_b0140","doi-asserted-by":"crossref","first-page":"530","DOI":"10.1145\/361147.361115","article-title":"A first order approximation to the optimum checkpoint interval","volume":"17","author":"Young","year":"1974","journal-title":"Commun. ACM"},{"issue":"3","key":"10.1016\/j.parco.2014.11.003_b0145","doi-asserted-by":"crossref","first-page":"303","DOI":"10.1016\/j.future.2004.11.016","article-title":"A higher order estimate of the optimum checkpoint interval for restart dumps","volume":"22","author":"Daly","year":"2006","journal-title":"Future Gener. Comput. Syst."},{"key":"10.1016\/j.parco.2014.11.003_b0150","series-title":"Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing, HPDC \u201910","first-page":"276","article-title":"Impact of sub-optimal checkpoint intervals on application efficiency in computational clusters","author":"Jones","year":"2010"},{"doi-asserted-by":"crossref","unstructured":"Y. Liu, R. Nassar, C. Leangsuksun, N. Naksinehaboon, M. Paun, S. Scott, An optimal checkpoint\/restart model for a large scale high performance computing system, in: IEEE International Symposium on Parallel and Distributed Processing, IPDPS 2008, 2008, pp. 1\u20139. http:\/\/dx.doi.org\/10.1109\/IPDPS.2008.4536279.","key":"10.1016\/j.parco.2014.11.003_b0155","DOI":"10.1109\/IPDPS.2008.4536279"},{"key":"10.1016\/j.parco.2014.11.003_b0160","series-title":"Proceedings of the 3rd Workshop on Fault-tolerance for HPC at Extreme Scale, FTXS \u201913","first-page":"49","article-title":"When is multi-version checkpointing needed","author":"Lu","year":"2013"},{"doi-asserted-by":"crossref","unstructured":"G. Aupy, A. Benoit, T. Herault, Y. Robert, F. Vivien, D. Zaidouni, On the combination of silent error detection and checkpointing, in: IEEE 19th Pacific Rim International Symposium on Dependable Computing (PRDC), 2013, pp. 11\u201320. http:\/\/dx.doi.org\/10.1109\/PRDC.2013.10.","key":"10.1016\/j.parco.2014.11.003_b0165","DOI":"10.1109\/PRDC.2013.10"},{"key":"10.1016\/j.parco.2014.11.003_b0170","series-title":"Proceedings of the Fifteenth ACM Symposium on Operating Systems Principles, SOSP \u201995","first-page":"12","article-title":"Hive: fault containment for shared-memory multiprocessors","author":"Chapin","year":"1995"},{"issue":"1","key":"10.1016\/j.parco.2014.11.003_b0175","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/311531.311532","article-title":"Fundamentals of fault-tolerant distributed computing in asynchronous environments","volume":"31","author":"G\u00e4rtner","year":"1999","journal-title":"ACM Comput. Surv."},{"issue":"4","key":"10.1016\/j.parco.2014.11.003_b0180","doi-asserted-by":"crossref","first-page":"299","DOI":"10.1145\/98163.98167","article-title":"Implementing fault-tolerant services using the state machine approach: a tutorial","volume":"22","author":"Schneider","year":"1990","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.parco.2014.11.003_b0185","series-title":"Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201911","first-page":"44:1","article-title":"Evaluating the viability of process replication reliability for exascale systems","author":"Ferreira","year":"2011"},{"doi-asserted-by":"crossref","unstructured":"B. Mills, T. Znati, R. Melhem, K. Ferreira, R. Grant, Energy consumption of resilience mechanisms in large scale systems, in: 22nd Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), 2014, pp. 528\u2013535. http:\/\/dx.doi.org\/10.1109\/PDP.2014.111.","key":"10.1016\/j.parco.2014.11.003_b0190","DOI":"10.1109\/PDP.2014.111"},{"doi-asserted-by":"crossref","unstructured":"S. Rangarajan, S. Garg, Y. Huang, Checkpoints-on-demand with active replication, in: Proceedings, Seventeenth IEEE Symposium on Reliable Distributed Systems, 1998, pp. 75\u201383. http:\/\/dx.doi.org\/10.1109\/RELDIS.1998.740477.","key":"10.1016\/j.parco.2014.11.003_b0195","DOI":"10.1109\/RELDIS.1998.740477"},{"key":"10.1016\/j.parco.2014.11.003_b0200","first-page":"201","article-title":"Sampling+ dmr: practical and low-overhead permanent fault detection","volume":"vol. 39","author":"Nomura","year":"2011"}],"container-title":["Parallel Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167819114001409?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167819114001409?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2019,8,17]],"date-time":"2019-08-17T16:40:11Z","timestamp":1566060011000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167819114001409"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,1]]},"references-count":40,"alternative-id":["S0167819114001409"],"URL":"https:\/\/doi.org\/10.1016\/j.parco.2014.11.003","relation":{},"ISSN":["0167-8191"],"issn-type":[{"type":"print","value":"0167-8191"}],"subject":[],"published":{"date-parts":[[2015,1]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"GS-DMR: Low-overhead soft error detection scheme for stencil-based computation","name":"articletitle","label":"Article Title"},{"value":"Parallel Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.parco.2014.11.003","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"Copyright \u00a9 2014 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}