{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,20]],"date-time":"2025-04-20T04:23:04Z","timestamp":1745122984487,"version":"3.40.4"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2012,11,2]],"date-time":"2012-11-02T00:00:00Z","timestamp":1351814400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2013,6]]},"DOI":"10.1007\/s10766-012-0218-5","type":"journal-article","created":{"date-parts":[[2012,11,1]],"date-time":"2012-11-01T12:07:23Z","timestamp":1351771643000},"page":"469-493","source":"Crossref","is-referenced-by-count":9,"title":["Multi-Fault Tolerance for Cartesian Data Distributions"],"prefix":"10.1007","volume":"41","author":[{"given":"Nawab","family":"Ali","sequence":"first","affiliation":[]},{"given":"Sriram","family":"Krishnamoorthy","sequence":"additional","affiliation":[]},{"given":"Mahantesh","family":"Halappanavar","sequence":"additional","affiliation":[]},{"given":"Jeff","family":"Daily","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2012,11,2]]},"reference":[{"key":"218_CR1","doi-asserted-by":"crossref","unstructured":"Ali, N., Carns, P.H., Iskra, K., Kimpe, D., Lang, S., Latham, R., Ross, R.B., Ward, L., Sadayappan, P.: Scalable I\/O forwarding framework for high-performance computing systems. In: IEEE International Conference on Cluster Computing, pp. 1\u201310, Aug (2009)","DOI":"10.1109\/CLUSTR.2009.5289188"},{"key":"218_CR2","doi-asserted-by":"crossref","unstructured":"Ali, N., Krishnamoorthy, S., Govind, N., Kowalski, K., Sadayappan, P.: Application-specific fault tolerance via data access characterization. 
In International European Conference on Parallel and Distributed Computing, Aug (2011a)","DOI":"10.1007\/978-3-642-23397-5_34"},{"key":"218_CR3","doi-asserted-by":"crossref","unstructured":"Ali, N., Krishnamoorthy, S., Govind, N., Palmer, B.: A redundant communication approach to scalable fault tolerance in PGAS programming models. In: 19th Euromicro International Conference on Parallel, Distributed and Network-Based Computing, Ayia Napa, Cyprus, Feb (2011b)","DOI":"10.1109\/PDP.2011.72"},{"key":"218_CR4","doi-asserted-by":"crossref","unstructured":"Ali, N., Krishnamoorthy, S., Halappanavar, M., Daily, J.: Tolerating correlated failures for generalized cartesian distributions via bipartite matching. In: ACM International Conference on Computing Frontiers, May (2011c)","DOI":"10.1145\/2016604.2016649"},{"issue":"4","key":"218_CR5","doi-asserted-by":"crossref","first-page":"410","DOI":"10.1016\/j.jpdc.2008.12.002","volume":"69","author":"G. Bosilca","year":"2009","unstructured":"Bosilca G., Delmas R., Dongarra J., Langou J.: Algorithm-based fault tolerance applied to high performance computing. J. Parallel Distrib. Comput. 69(4), 410\u2013416 (2009)","journal-title":"J. Parallel Distrib. Comput."},{"key":"218_CR6","doi-asserted-by":"crossref","unstructured":"Bronevetsky, G., Moody, A.: Scalable I\/O systems via node-local storage: approaching 1\u00a0TB\/sec file I\/O. Technical report LLNL-TR-415791, Lawrence Livermore National Laboratory, Aug (2009)","DOI":"10.2172\/964079"},{"key":"218_CR7","doi-asserted-by":"crossref","DOI":"10.1137\/1.9780898717754","volume-title":"Assignment Problems","author":"R. Burkard","year":"2009","unstructured":"Burkard R., Dell\u2019Amico M., Martello S.: Assignment Problems. Society for Industrial and Applied Mathematics, Philadelphia (2009)"},{"key":"218_CR8","unstructured":"Chen, Z., Dongarra, J.: Algorithm-based checkpoint-free fault tolerance for parallel matrix computations on volatile resources. 
In: IEEE International Parallel & Distributed Processing Symposium, Apr (2006)"},{"issue":"12","key":"218_CR9","doi-asserted-by":"crossref","first-page":"1628","DOI":"10.1109\/TPDS.2008.58","volume":"19","author":"Z. Chen","year":"2008","unstructured":"Chen Z., Dongarra J.: Algorithm-based fault tolerance for fail-stop failures. IEEE Trans. Parallel Distrib. Syst. 19(12), 1628\u20131641 (2008)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"218_CR10","doi-asserted-by":"crossref","unstructured":"Costa, P., Pasin, M., Bessani, A., Correia, M.: Byzantine fault-tolerant mapreduce: faults are not just crashes. In: IEEE International Conference on Cloud Computing Technology and Science, pp. 32\u201339 (2011)","DOI":"10.1109\/CloudCom.2011.15"},{"issue":"9","key":"218_CR11","doi-asserted-by":"crossref","first-page":"887","DOI":"10.1016\/S0743-7315(03)00103-5","volume":"63","author":"A. Darte","year":"2003","unstructured":"Darte A., Mellor-Crummey J., Fowler R., Chavarr\u00eda-Miranda D.: Generalized multipartitioning of multi-dimensional arrays for parallelizing line-sweep computations. J. Parallel Distrib. Comput. 63(9), 887\u2013911 (2003)","journal-title":"J. Parallel Distrib. Comput."},{"key":"218_CR12","unstructured":"Dean, J., Ghemawat S.: MapReduce: simplified data processing on large clusters. In: USENIX Symposium on Operating Systems Design and Implementation, pp. 137\u2013150 (2004)"},{"issue":"3","key":"218_CR13","doi-asserted-by":"crossref","first-page":"375","DOI":"10.1145\/568522.568525","volume":"34","author":"E.N. Elnozahy","year":"2002","unstructured":"Elnozahy E.N., Alvisi L., Wang Y.-M., Johnson D.B.: A survey of rollback-recovery protocols in message-passing systems. ACM Comput. Surv. 34(3), 375\u2013408 (2002)","journal-title":"ACM Comput. Surv."},{"key":"218_CR14","doi-asserted-by":"crossref","unstructured":"Engelmann, C., Vall\u00e9e, G., Naughton, T., Scott, S.L.: Proactive fault tolerance using preemptive migration. 
In: International Conference on Parallel, Distributed and Network-based Processing, pp. 252\u2013257, Feb (2009)","DOI":"10.1109\/PDP.2009.31"},{"key":"218_CR15","doi-asserted-by":"crossref","unstructured":"Fagg, G.E., Dongarra, J.: FT-MPI: fault tolerant MPI, supporting dynamic applications in a dynamic world. In: Proceedings of the 7th European PVM\/MPI Users\u2019 Group Meeting on Recent Advances in Parallel Virtual Machine and Message Passing Interface, pp. 346\u2013353 (2000)","DOI":"10.1007\/3-540-45255-9_47"},{"issue":"2","key":"218_CR16","doi-asserted-by":"crossref","first-page":"221","DOI":"10.1145\/321941.321942","volume":"23","author":"H.N. Gabow","year":"1976","unstructured":"Gabow H.N.: An efficient implementation of edmonds\u2019 algorithm for maximum matching on graphs. J. ACM 23(2), 221\u2013234 (1976)","journal-title":"J. ACM"},{"key":"218_CR17","doi-asserted-by":"crossref","unstructured":"Gupta, R., Beckman, P., Park, B.-H., Lusk, E., Hargrove, P., Geist, A., Panda, D., Lumsdaine, A., Dongarra, J.: CIFTS: a coordinated infrastructure for fault-tolerant systems. In: Proceedings of the International Conference on Parallel Processing, pp. 237\u2013245 (2009)","DOI":"10.1109\/ICPP.2009.20"},{"key":"218_CR18","unstructured":"Halappanavar, M.: Algorithms for vertex-weighted matching in graphs. PhD thesis, Old Dominion University, Norfolk, VA (2009)"},{"issue":"1","key":"218_CR19","doi-asserted-by":"crossref","first-page":"494","DOI":"10.1088\/1742-6596\/46\/1\/067","volume":"46","author":"P.H. Hargrove","year":"2006","unstructured":"Hargrove P.H., Duell J.C.: Berkeley lab checkpoint\/restart (BLCR) for Linux clusters. J. Phys. Conf. Ser. 46(1), 494\u2013499 (2006)","journal-title":"J. Phys. Conf. Ser."},{"key":"218_CR20","doi-asserted-by":"crossref","first-page":"225","DOI":"10.1137\/0202019","volume":"2","author":"J. 
Hopcroft","year":"1973","unstructured":"Hopcroft J., Karp R.: A $${n^{\\frac{5}{2}}}$$ algorithm for maximum matchings in bipartite graphs. SIAM J. Comput. 2, 225\u2013231 (1973)","journal-title":"SIAM J. Comput."},{"issue":"6","key":"218_CR21","doi-asserted-by":"crossref","first-page":"518","DOI":"10.1109\/TC.1984.1676475","volume":"33","author":"K.-H. Huang","year":"1984","unstructured":"Huang K.-H., Abraham J.A.: Algorithm-based fault tolerance for matrix operations. IEEE Trans. Comput. 33(6), 518\u2013528 (1984)","journal-title":"IEEE Trans. Comput."},{"key":"218_CR22","unstructured":"HPL. http:\/\/www.netlib.org\/benchmark\/hpl"},{"key":"218_CR23","doi-asserted-by":"crossref","unstructured":"Isard, M., Budiu, M., Yu, Y., Birrell, A., Fetterly, D.: Dryad: distributed data-parallel programs from sequential building blocks. In: Proceedings of the 2nd ACM SIGOPS\/EuroSys European Conference on Computer Systems, pp. 59\u201372 (2007)","DOI":"10.1145\/1272996.1273005"},{"key":"218_CR24","doi-asserted-by":"crossref","first-page":"83","DOI":"10.1002\/nav.3800020109","volume":"2","author":"H.W. Kuhn","year":"1955","unstructured":"Kuhn H.W.: The Hungarian method for the assignment problem. Naval Res. Logist. Q. 2, 83\u201397 (1955)","journal-title":"Naval Res. Logist. Q."},{"key":"218_CR25","volume-title":"Combinatorial Optimization: Networks and Matroids","author":"E. Lawler","year":"2001","unstructured":"Lawler E.: Combinatorial Optimization: Networks and Matroids. Dover Publications, Mineola (2001)"},{"key":"218_CR26","volume-title":"Matching Theory","author":"L. Lovasz","year":"1986","unstructured":"Lovasz L., Plummer M.D.: Matching Theory. North-Holland Publishing Co., Amsterdam (1986)"},{"issue":"6","key":"218_CR27","doi-asserted-by":"crossref","first-page":"1329","DOI":"10.1145\/195613.195663","volume":"41","author":"R. Motwani","year":"1994","unstructured":"Motwani R.: Average-case analysis of algorithms for matchings and related problems. J. 
ACM 41(6), 1329\u20131356 (1994)","journal-title":"J. ACM"},{"key":"218_CR28","doi-asserted-by":"crossref","first-page":"203","DOI":"10.1177\/1094342006064503","volume":"20","author":"J. Nieplocha","year":"2006","unstructured":"Nieplocha J., Palmer B., Tipparaju V., Krishnan M., Trease H., Apr\u00e0 E.: Advances, applications and performance of the global arrays shared memory programming toolkit. Int. J. High Perform. Comput. Appl. 20, 203\u2013231 (2006)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"218_CR29","unstructured":"Panda, D.K.: MVAPICH. http:\/\/mvapich.cse.ohio-state.edu"},{"key":"218_CR30","volume-title":"Combinatorial Optimization: Algorithms and Complexity","author":"C.H. Papadimitriou","year":"1982","unstructured":"Papadimitriou C.H., Steiglitz K.: Combinatorial Optimization: Algorithms and Complexity. Prentice-Hall Inc., Upper Saddle River (1982)"},{"key":"218_CR31","doi-asserted-by":"crossref","unstructured":"Plank, J., Li, K.: Faster checkpointing with N + 1 parity. In: International Symposium on Fault-Tolerant Computing, pp. 288\u2013297, June (1994)","DOI":"10.1109\/FTCS.1994.315631"},{"key":"218_CR32","unstructured":"Plank, J.S., Beck, M., Kingsley, G., Li, K.: Libckpt: transparent checkpointing under Unix. In: Usenix Winter Technical Conference, pp. 213\u2013223, Jan (1995)"},{"issue":"10","key":"218_CR33","doi-asserted-by":"crossref","first-page":"972","DOI":"10.1109\/71.730527","volume":"9","author":"J.S. Plank","year":"1998","unstructured":"Plank J.S., Li K., Puening M.A.: Diskless checkpointing. IEEE Trans. Parallel Distrib. Syst. 9(10), 972\u2013986 (1998)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"218_CR34","volume-title":"Combinatorial Optimization: Polyhedra and Efficiency","author":"A. Schrijver","year":"2003","unstructured":"Schrijver A.: Combinatorial Optimization: Polyhedra and Efficiency. 
Springer Publishing Co., New York (2003)"},{"issue":"1","key":"218_CR35","first-page":"1","volume":"78","author":"B. Schroeder","year":"2007","unstructured":"Schroeder B., Gibson G.A.: Understanding failures in petascale computers. J. Phys. Conf. Ser. 78(1), 1\u201311 (2007)","journal-title":"J. Phys. Conf. Ser."},{"key":"218_CR36","unstructured":"Tipparaju, V., Krishnan, M., Palmer, B., Petrini, F., Nieplocha, J.: Towards fault resilient global arrays. In: International Conference on Parallel Computing, vol. 15, pp. 339\u2013345 (2007)"},{"key":"218_CR37","unstructured":"The ScaLAPACK project. http:\/\/www.netlib.org\/scalapack"},{"issue":"9","key":"218_CR38","doi-asserted-by":"crossref","first-page":"1477","DOI":"10.1016\/j.cpc.2010.04.018","volume":"181","author":"M. Valiev","year":"2010","unstructured":"Valiev M., Bylaska E., Govind N., Kowalski K., Straatsma T., Dam H.V., Wang D., Nieplocha J., Apra E., Windus T., de Jong W.: NWChem: a comprehensive and scalable open-source solution for large scale molecular simulations. Comput. Phys. Commun. 181(9), 1477\u20131489 (2010)","journal-title":"Comput. Phys. Commun."},{"key":"218_CR39","doi-asserted-by":"crossref","unstructured":"Wang, C., Mueller, F., Engelmann, C., Scott, S.L.: Proactive process-level live migration in HPC environments. In: Proceedings of the ACM\/IEEE Conference on Supercomputing, pp. 1\u201312, Nov (2008)","DOI":"10.1109\/SC.2008.5222634"},{"key":"218_CR40","volume-title":"Integer Programming","author":"L.A. Wolsey","year":"1998","unstructured":"Wolsey L.A.: Integer Programming. Wiley, Hoboken (1998)"},{"key":"218_CR41","unstructured":"Zheng, G., Shi, L., Kale, L.V.: FTC-Charm++: an in-memory checkpoint-based fault tolerant runtime for charm++ and MPI. In: IEEE International Conference on Cluster Computing, pp. 
93\u2013103, Sept (2004)"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-012-0218-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-012-0218-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-012-0218-5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,20]],"date-time":"2025-04-20T02:02:17Z","timestamp":1745114537000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-012-0218-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,11,2]]},"references-count":41,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2013,6]]}},"alternative-id":["218"],"URL":"https:\/\/doi.org\/10.1007\/s10766-012-0218-5","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"type":"print","value":"0885-7458"},{"type":"electronic","value":"1573-7640"}],"subject":[],"published":{"date-parts":[[2012,11,2]]}}}