{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:40:06Z","timestamp":1740123606267,"version":"3.37.3"},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2017,8,20]],"date-time":"2017-08-20T00:00:00Z","timestamp":1503187200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61402488"],"award-info":[{"award-number":["61402488"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2019,8]]},"DOI":"10.1007\/s11227-017-2116-5","type":"journal-article","created":{"date-parts":[[2017,8,20]],"date-time":"2017-08-20T05:54:21Z","timestamp":1503208461000},"page":"4226-4247","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Toward fault-tolerant hybrid programming over large-scale heterogeneous clusters via checkpointing\/restart optimization"],"prefix":"10.1007","volume":"75","author":[{"given":"Cheng","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunfei","family":"Du","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ke","family":"Zuo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianbin","family":"Fang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Canqun","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,8,20]]},"reference":[{"key":"2116_CR1","doi-asserted-by":"publisher","first-page":"344","DOI":"10.1007\/s02011-011-1137-8","volume":"26","author":"X Yang","year":"2011","unstructured":"Yang X, Liao X, Lu K, Hu Q, Song J, Su J (2011) The Tianhe-1A supercomputer: its hardware and software. J Comput Sci Technol 26:344\u2013351","journal-title":"J Comput Sci Technol"},{"key":"2116_CR2","unstructured":"www.top500.org"},{"key":"2116_CR3","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/s11704-014-3501-3","volume":"8","author":"X Liao","year":"2014","unstructured":"Liao X, Xiao L, Yang C, Lv Y (2014) Milkyway-2 supercomputer: system and application. Front Comput Sci 8:345\u2013356","journal-title":"Front Comput Sci"},{"key":"2116_CR4","doi-asserted-by":"crossref","unstructured":"Liao X, Yang C, Tang T, Yi H, Wang F, Wu Q, Xue J (2014) OpenMC: towards simplifying programming for tianhe supercomputers. J Comput Sci Technol 29(3):532\u2013546","DOI":"10.1007\/s11390-014-1447-4"},{"key":"2116_CR5","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1109\/MCSE.2015.40","volume":"17","author":"A Dubrow","year":"2015","unstructured":"Dubrow A (2015) What got done in one year at NSF\u2019s Stampede supercomputer. Comput Sci Eng 17:83\u201388","journal-title":"Comput Sci Eng"},{"key":"2116_CR6","doi-asserted-by":"crossref","unstructured":"Chen C, Fang J, Tang T, Yang C (2017) LU factorization on heterogeneous systems: an energy-efficient approach towards high performance. Computing 99(8):791\u2013811","DOI":"10.1007\/s00607-016-0537-2"},{"key":"2116_CR7","doi-asserted-by":"crossref","unstructured":"Karablieh F, Bazzi RA (2002) Heterogeneous checkpointing for multithreaded applications. In: 21st IEEE Symposium on Reliable Distributed Systems. IEEE, pp 140\u2013149","DOI":"10.1109\/RELDIS.2002.1180182"},{"key":"2116_CR8","doi-asserted-by":"crossref","unstructured":"Di Martino C, Kramer W, Kalbarczyk Z, Iyer R (2015) Measuring and understanding extreme-scale application resilience: a field study of 5,000,000 HPC application runs. In: IEEE\/IFIP International Conference on Dependable Systems and Networks, pp 25\u201336","DOI":"10.1109\/DSN.2015.50"},{"key":"2116_CR9","doi-asserted-by":"crossref","unstructured":"Gomez LB, Nukada A, Maruyama N, Cappello F (2010) Low-overhead diskless checkpoint for hybrid computing systems. In: 2010 International Conference on High Performance Computing, pp 1\u201310","DOI":"10.1109\/HIPC.2010.5713163"},{"key":"2116_CR10","doi-asserted-by":"crossref","unstructured":"Zheng G, Ni X, Kal\u00e9 LV (2012) A scalable double in-memory checkpoint and restart scheme towards exascale. In: 2012 IEEE\/IFIP 42nd International Conference on Dependable Systems and Networks Workshops. IEEE, pp 1\u20136","DOI":"10.1109\/DSNW.2012.6264677"},{"key":"2116_CR11","doi-asserted-by":"crossref","unstructured":"Sato K, Maruyama N, Mohror K, Moody A, Gamblin T, de Supinski BR, Matsuoka S (2012) Design and modeling of a non-blocking checkpointing system. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp 19:1\u201319:10","DOI":"10.1109\/SC.2012.46"},{"key":"2116_CR12","doi-asserted-by":"crossref","unstructured":"Gomez LAB, Maruyama N, Cappello F, Matsuoka S (2010) Distributed diskless checkpoint for large scale systems. In: Proceedings of the 2010 10th IEEE\/ACM International Conference on Cluster, Cloud and Grid Computing, pp 63\u201372","DOI":"10.1109\/CCGRID.2010.40"},{"key":"2116_CR13","doi-asserted-by":"crossref","unstructured":"Ropars T, Martsinkevich TV, Guermouche A, Schiper A, Cappello F (2013) SPBC: leveraging the characteristics of MPI HPC applications for scalable checkpointing. In: High performance computing, networking, storage and analysis, pp 1\u201312","DOI":"10.1145\/2503210.2503271"},{"key":"2116_CR14","doi-asserted-by":"crossref","unstructured":"Dong X, Wen M, Chai J, Cai X, Zhao M, Zhang C (2015) Communication-hiding programming for clusters with multi-coprocessor nodes. Concurr Comput Pract Exp 27(16):4172\u20134185","DOI":"10.1002\/cpe.3507"},{"key":"2116_CR15","doi-asserted-by":"publisher","first-page":"146","DOI":"10.1016\/j.jpdc.2017.01.020","volume":"104","author":"BB Fraguela","year":"2017","unstructured":"Fraguela BB, Losada N, Gonz\u00e1lez P, Mart\u00edn MJ (2017) A portable and adaptable fault tolerance solution for heterogeneous applications. J Parallel Distrib Comput 104:146\u2013158","journal-title":"J Parallel Distrib Comput"},{"key":"2116_CR16","doi-asserted-by":"crossref","unstructured":"Kannan S, Farooqui N, Gavrilovska A, Schwan K (2014) HeteroCheckpoint: efficient checkpointing for accelerator-based systems. In: Proceedings of the 2014 44th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks, pp 738\u2013743","DOI":"10.1109\/DSN.2014.76"},{"key":"2116_CR17","doi-asserted-by":"crossref","unstructured":"Takizawa H, Koyama K, Sato K, Komatsu K, Kobayashi H (2011) CheCL: transparent checkpointing and process migration of OpenCL applications. In: 2011 IEEE International on Parallel and Distributed Processing Symposium. IEEE, pp 864\u2013876","DOI":"10.1109\/IPDPS.2011.85"},{"key":"2116_CR18","doi-asserted-by":"crossref","unstructured":"Takizawa H, Sato K, Komatsu K, Kobayashi H (2009) CheCUDA: a checkpoint\/restart tool for CUDA applications. In: 2009 International Conference on Parallel and Distributed Computing, Applications and Technologies. IEEE, pp 408\u2013413","DOI":"10.1109\/PDCAT.2009.78"},{"key":"2116_CR19","doi-asserted-by":"crossref","unstructured":"Nukada A, Takizawa H, Matsuoka S (2011) NVCR: a transparent checkpoint-restart library for NVIDIA CUDA. In: 2011 IEEE International Symposium on Parallel and Distributed Processing Workshops and Phd Forum (IPDPSW), pp 104\u2013113","DOI":"10.1109\/IPDPS.2011.131"},{"key":"2116_CR20","doi-asserted-by":"crossref","unstructured":"Rezaei A, Coviello G, Li C-H, Chakradhar S, Mueller F (2014) Snapify: capturing snapshots of offload applications on Xeon Phi manycore processors. In: Proceedings of the 23rd International Symposium on High-Performance Parallel and Distributed Computing. New York, NY, USA. ACM, pp 1\u201312","DOI":"10.1145\/2600212.2600215"},{"key":"2116_CR21","unstructured":"Knights Corner software developers guide. April 27 (2012)"},{"key":"2116_CR22","unstructured":"Schulz KW, Ulerich R, Malaya N, Bauman PT, Stogner R, Simmons C (2012) Early experiences porting scientific applications to the many integrated core (MIC) platform. In: TACC-Intel Highly Parallel Computing Symposium, Austin, TX"},{"key":"2116_CR23","doi-asserted-by":"crossref","unstructured":"Koo R, Toueg S (1987) Checkpointing and rollback-recovery for distributed systems. In: IEEE Transactions on Software Engineering, no 1. IEEE, pp 23\u201331","DOI":"10.1109\/TSE.1987.232562"},{"key":"2116_CR24","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611971538","volume-title":"Templates for the solution of linear systems: building blocks for iterative methods","author":"R Barrett","year":"1994","unstructured":"Barrett R, Berry MW, Chan TF, Demmel J, Donato J, Dongarra J, Eijkhout V, Pozo R, Romine C, Van der Vorst H (1994) Templates for the solution of linear systems: building blocks for iterative methods, vol 43. Siam, Philadelphia"},{"key":"2116_CR25","doi-asserted-by":"crossref","unstructured":"Yang C, Wang F, Du Y, Chen J, Liu J, Yi H, Lu K (2010) Adaptive optimization for petascale heterogeneous CPU\/GPU computing. In: 2010 IEEE International Conference on Cluster Computing. IEEE, pp 19\u201328","DOI":"10.1109\/CLUSTER.2010.12"},{"key":"2116_CR26","doi-asserted-by":"publisher","first-page":"1477","DOI":"10.1002\/qua.21665","volume":"108","author":"S Shahbazian","year":"2008","unstructured":"Shahbazian S (2008) Revisiting the foundations of quantum theory of atoms in molecules: the variational procedure and the zero-flux conditions. Int J Quantum Chem 108:1477\u20131484","journal-title":"Int J Quantum Chem"},{"issue":"9","key":"2116_CR27","doi-asserted-by":"publisher","first-page":"530","DOI":"10.1145\/361147.361115","volume":"17","author":"JW Young","year":"1974","unstructured":"Young JW (1974) A first order approximation to the optimum checkpoint interval. Commun ACM 17(9):530\u2013531","journal-title":"Commun ACM"},{"key":"2116_CR28","doi-asserted-by":"crossref","unstructured":"Xu X, Lin Y, Tang T, Lin Y (2010) HiAL-Ckpt: a hierarchical application-level checkpointing for CPU-GPU hybrid systems. In: 2010 5th International Conference on Computer Science Education, pp 1895\u20131899","DOI":"10.1109\/ICCSE.2010.5593819"},{"key":"2116_CR29","unstructured":"Laosooksathit S, Naksinehaboon N, Leangsuksan C, Dhungana A, Chandler C, Chanchio K, Farbin A (2010) Lightweight checkpoint mechanism and modeling in GPGPU environment. In: 4th workshop on system level virtualization for high performance computing (HPCVirt 2010), April 2010"},{"key":"2116_CR30","doi-asserted-by":"crossref","unstructured":"Guo X, Jiang H, Li KC (2013) A checkpoint\/restart scheme for CUDA applications with complex memory hierarchy. In: 2013 14th ACIS International Conference on Software Engineering, Artificial Intelligence, Networking and Parallel\/Distributed Computing, pp 247\u2013252","DOI":"10.1109\/SNPD.2013.5"},{"key":"2116_CR31","doi-asserted-by":"crossref","unstructured":"Pe\u00f1a AJ, Bland W, Balaji P (2015) VOCL-FT: introducing techniques for efficient soft error coprocessor recovery. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp 71:1\u201371:12","DOI":"10.1145\/2807591.2807640"},{"key":"2116_CR32","doi-asserted-by":"crossref","unstructured":"Rajachandrasekar R, Potluri S, Venkatesh A, Hamidouche K, Wasi-ur Rahman Md, Panda DK (2014) MIC-Check: a distributed check pointing framework for the intel many integrated cores architecture. In: Proceedings of the 23rd International Symposium on High-Performance Parallel and Distributed Computing. ACM, pp 121\u2013124","DOI":"10.1145\/2600212.2600713"},{"key":"2116_CR33","doi-asserted-by":"crossref","unstructured":"Chen C, Du Y, Xu Z, Yang C (2015) FT-Offload: a scalable fault-tolerance programing model on MIC cluster. In: Proceeding of 15th International Conference on Algorithms and Architectures for Parallel Processing, pp 3\u201317","DOI":"10.1007\/978-3-319-27140-8_1"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-017-2116-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-017-2116-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-017-2116-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,23]],"date-time":"2019-09-23T11:46:08Z","timestamp":1569239168000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-017-2116-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,8,20]]},"references-count":33,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2019,8]]}},"alternative-id":["2116"],"URL":"https:\/\/doi.org\/10.1007\/s11227-017-2116-5","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2017,8,20]]},"assertion":[{"value":"20 August 2017","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}