{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T15:12:57Z","timestamp":1742397177956},"publisher-location":"Berlin, Heidelberg","reference-count":25,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783540680390"},{"type":"electronic","value":"9783540680406"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2006]]},"DOI":"10.1007\/11945918_47","type":"book-chapter","created":{"date-parts":[[2006,12,17]],"date-time":"2006-12-17T22:09:45Z","timestamp":1166393385000},"page":"485-496","source":"Crossref","is-referenced-by-count":28,"title":["Proactive Fault Tolerance in MPI Applications Via Task Migration"],"prefix":"10.1007","author":[{"given":"Sayantan","family":"Chakravorty","sequence":"first","affiliation":[]},{"given":"Celso L.","family":"Mendes","sequence":"additional","affiliation":[]},{"given":"Laxmikant V.","family":"Kal\u00e9","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"47_CR1","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/7056.001.0001","volume-title":"Using MPI","author":"W. Gropp","year":"1999","unstructured":"Gropp, W., Lusk, E., Skjellum, A.: Using MPI, 2nd edn. MIT Press, Cambridge (1999)","edition":"2"},{"issue":"3","key":"47_CR2","doi-asserted-by":"publisher","first-page":"363","DOI":"10.1177\/1094342004046045","volume":"18","author":"W. Gropp","year":"2004","unstructured":"Gropp, W., Lusk, E.: Fault tolerance in message passing interface programs. International Journal of High Performance Computing Applications\u00a018(3), 363\u2013372 (2004)","journal-title":"International Journal of High Performance Computing Applications"},{"key":"47_CR3","unstructured":"Huang, C.: System support for checkpoint and restart of Charm++ and AMPI applications. Master\u2019s thesis, Dep. of Computer Science, University of Illinois, Urbana, IL (2004), Available at: http:\/\/charm.cs.uiuc.edu\/papers\/CheckpointThesis.html"},{"key":"47_CR4","unstructured":"Zheng, G., Shi, L., Kal\u00e9, L.V.: FTC-Charm++: An in-memory checkpoint-based fault tolerant runtime for Charm++ and MPI. In: 2004 IEEE International Conference on Cluster Computing, San Diego, CA (2004)"},{"key":"47_CR5","volume-title":"FTPDS Workshop at IPDPS 2004","author":"S. Chakravorty","year":"2004","unstructured":"Chakravorty, S., Kal\u00e9, L.V.: A fault tolerant protocol for massively parallel machines. In: FTPDS Workshop at IPDPS 2004, Santa Fe, NM. IEEE Press, Los Alamitos (2004)"},{"key":"47_CR6","unstructured":"Chakravorty, S., Mendes, C.L., Kale, L.V.: Proactive fault tolerance in large systems. In: HPCRI Workshop in conjunction with HPCA 2005 (2005)"},{"key":"47_CR7","unstructured":"Hewlett-Packard, Intel, Microsoft, Phoenix, Toshiba: Advanced configuration and power interface specification. ACPI Specification Document, Revision 3.0 (2004), Available from: http:\/\/www.acpi.info"},{"key":"47_CR8","doi-asserted-by":"crossref","unstructured":"Sahoo, R.K., Oliner, A.J., Rish, I., Gupta, M., Moreira, J.E., Ma, S., Vilalta, R., Sivasubramaniam, A.: Critical event prediction for proactive management in large-scale computer clusters. In: Proceedings og the ACM SIGKDD, Intl. Conf. on Knowledge Discovery Data Mining, pp. 426\u2013435 (2003)","DOI":"10.1145\/956750.956799"},{"key":"47_CR9","unstructured":"Oliner, A.J., Sahoo, R.K., Moreira, J.E., Gupta, M., Sivasubramaniam, A.: Fault-aware job scheduling for BlueGene\/L systems. Technical Report RC23077, IBM Research (2004)"},{"key":"47_CR10","doi-asserted-by":"crossref","first-page":"175","DOI":"10.7551\/mitpress\/5241.003.0009","volume-title":"Parallel Programming using C++","author":"L.V. Kal\u00e9","year":"1996","unstructured":"Kal\u00e9, L.V., Krishnan, S.: Charm++: Parallel programming with message-driven objects. In: Wilson, G.V., Lu, P. (eds.) Parallel Programming using C++, pp. 175\u2013213. MIT Press, Cambridge (1996)"},{"key":"47_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-24644-2_20","volume-title":"Languages and Compilers for Parallel Computing","author":"C. Huang","year":"2004","unstructured":"Huang, C., Lawlor, O., Kal\u00e9, L.V.: Adaptive MPI. In: Rauchwerger, L. (ed.) LCPC 2003. LNCS, vol.\u00a02958. Springer, Heidelberg (2004)"},{"key":"47_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"476","DOI":"10.1007\/978-3-540-71351-7_37","volume-title":"High Performance Computing for Computational Science - VECPAR 2006","author":"F. Gioachin","year":"2007","unstructured":"Gioachin, F., Sharma, A., Chakravorty, S., Mendes, C.L., Kal\u00e9, L.V., Quinn, T.: Scalable Cosmological Simulations on Parallel Machines. In: Dayd\u00e9, M., Palma, J.M.L.M., Coutinho, \u00c1.L.G.A., Pacitti, E., Lopes, J.C. (eds.) VECPAR 2006. LNCS, vol.\u00a04395, pp. 476\u2013489. Springer, Heidelberg (2007)"},{"key":"47_CR13","doi-asserted-by":"crossref","unstructured":"Kal\u00e9, L.V., Kumar, S., Zheng, G., Lee, C.W.: Scaling molecular dynamics to 3000 processors with projections: A performance analysis case study. In: Terascale Performance Analysis Workshop, International Conference on Computational Science (ICCS), Melbourne, Australia (2003)","DOI":"10.1007\/3-540-44864-0_3"},{"key":"47_CR14","doi-asserted-by":"publisher","first-page":"371","DOI":"10.1002\/cpe.665","volume":"15","author":"O.S. Lawlor","year":"2003","unstructured":"Lawlor, O.S., Kal\u00e9, L.V.: Supporting dynamic parallel object arrays. Concurrency and Computation: Practice and Experience\u00a015, 371\u2013393 (2003)","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"47_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"496","DOI":"10.1007\/BFb0097934","volume-title":"Proc. 3rd Workshop on Runtime Systems for Parallel Programming (RTSPP)","author":"G. Antoniu","year":"1999","unstructured":"Antoniu, G., Bouge, L., Namyst, R.: An efficient and transparent thread migration scheme in the PM 2 runtime system. In: Juan, S., Rico, P. (eds.) Proc. 3rd Workshop on Runtime Systems for Parallel Programming (RTSPP). LNCS, vol.\u00a01586, pp. 496\u2013510. Springer, Heidelberg (1999)"},{"key":"47_CR16","doi-asserted-by":"crossref","unstructured":"Stellner, G.: CoCheck: Checkpointing and process migration for MPI. In: Proceedings of the 10th International Parallel Processing Symposium, pp. 526\u2013531 (1996)","DOI":"10.1109\/IPPS.1996.508106"},{"issue":"3","key":"47_CR17","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1023\/A:1023540604208","volume":"6","author":"A. Agbaria","year":"2003","unstructured":"Agbaria, A., Friedman, R.: Starfish: Fault-tolerant dynamic MPI programs on clusters of workstations. Cluster Computing\u00a06(3), 227\u2013236 (2003)","journal-title":"Cluster Computing"},{"key":"47_CR18","doi-asserted-by":"crossref","unstructured":"Chen, Y., Plank, J.S., Li, K.: Clip: A checkpointing tool for message-passing parallel programs. In: Proceedings of the 1997 ACM\/IEEE conference on Supercomputing (CDROM), pp. 1\u201311 (1997)","DOI":"10.1145\/509593.509626"},{"issue":"3","key":"47_CR19","doi-asserted-by":"publisher","first-page":"204","DOI":"10.1145\/3959.3962","volume":"3","author":"R. Strom","year":"1985","unstructured":"Strom, R., Yemini, S.: Optimistic recovery in distributed systems. ACM Transactions on Computer Systems\u00a03(3), 204\u2013226 (1985)","journal-title":"ACM Transactions on Computer Systems"},{"issue":"3","key":"47_CR20","doi-asserted-by":"publisher","first-page":"353","DOI":"10.1177\/1094342004046052","volume":"18","author":"G.E. Fagg","year":"2004","unstructured":"Fagg, G.E., Dongarra, J.J.: Building and using a fault-tolerant MPI implementation. International Journal of High Performance Computing Applications\u00a018(3), 353\u2013361 (2004)","journal-title":"International Journal of High Performance Computing Applications"},{"key":"47_CR21","doi-asserted-by":"publisher","first-page":"26","DOI":"10.1109\/CCGRID.2001.923171","volume-title":"Proceedings of the 1st International Symposium on Cluster Computing and the Grid","author":"R. Batchu","year":"2001","unstructured":"Batchu, R., Skjellum, A., Cui, Z., Beddhu, M., Neelamegam, J.P., Dandass, Y., Apte, M.: Mpi\/fttm: Architecture and taxonomies for fault-tolerant, message-passing middleware for performance-portable parallel computing. In: Proceedings of the 1st International Symposium on Cluster Computing and the Grid, p. 26. IEEE Computer Society, Los Alamitos (2001)"},{"issue":"4","key":"47_CR22","doi-asserted-by":"publisher","first-page":"371","DOI":"10.1142\/S0129626400000342","volume":"10","author":"S. Louca","year":"2000","unstructured":"Louca, S., Neophytou, N., Lachanas, A., Evripidou, P.: MPI-FT: Portable fault tolerance scheme for MPI. Parallel Processing Letters\u00a010(4), 371\u2013382 (2000)","journal-title":"Parallel Processing Letters"},{"key":"47_CR23","doi-asserted-by":"crossref","unstructured":"Bouteiller, A., Cappello, F., H\u00e9rault, T., Krawezik, G., Lemarinier, P., Magniette, F.: MPICH-V2: A fault tolerant MPI for volatile nodes based on the pessimistic sender based message logging programming via processor virtualization. In: Proceedings of Supercomputing 2003, Phoenix, AZ (2003)","DOI":"10.1145\/1048935.1050176"},{"issue":"5","key":"47_CR24","doi-asserted-by":"publisher","first-page":"526","DOI":"10.1109\/12.142678","volume":"41","author":"E.N. Elnozahy","year":"1992","unstructured":"Elnozahy, E.N., Zwaenepoel, W.: Manetho: Transparent rollback-recovery with low overhead, limited rollback, and fast output commit. IEEE Transactions on Computers\u00a041(5), 526\u2013531 (1992)","journal-title":"IEEE Transactions on Computers"},{"key":"47_CR25","doi-asserted-by":"crossref","unstructured":"Pertet, S., Narasimhan, P.: Proactive recovery in distributed CORBA applications. In: Proceedings of the International Conference on Dependable Systems and Networks, pp. 357\u2013366 (2004)","DOI":"10.1109\/DSN.2004.1311905"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing - HiPC 2006"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/11945918_47.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,9]],"date-time":"2024-02-09T08:29:13Z","timestamp":1707467353000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/11945918_47"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2006]]},"ISBN":["9783540680390","9783540680406"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/11945918_47","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2006]]}}}