{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:54:15Z","timestamp":1774018455348,"version":"3.50.1"},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2010,3,13]],"date-time":"2010-03-13T00:00:00Z","timestamp":1268438400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Cluster Comput"],"published-print":{"date-parts":[[2010,12]]},"DOI":"10.1007\/s10586-010-0126-7","type":"journal-article","created":{"date-parts":[[2010,3,12]],"date-time":"2010-03-12T17:01:09Z","timestamp":1268413269000},"page":"421-434","source":"Crossref","is-referenced-by-count":9,"title":["Failure-aware workflow scheduling in cluster environments"],"prefix":"10.1007","volume":"13","author":[{"given":"Zhifeng","family":"Yu","sequence":"first","affiliation":[]},{"given":"Chenjia","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Weisong","family":"Shi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2010,3,13]]},"reference":[{"key":"126_CR1","volume-title":"Computers and Intractibility: A Guide to the Theory of NP-completeness","author":"M. Garey","year":"1979","unstructured":"Garey, M., Johnson, D.: Computers and Intractibility: A Guide to the Theory of NP-completeness. Freeman, San Francisco (1979)"},{"key":"126_CR2","unstructured":"Open science grid. [Online]. Available: http:\/\/www.opensciencegrid.org\/"},{"key":"126_CR3","unstructured":"Nsf taragrid. [Online]. Available: http:\/\/www.teragrid.org\/"},{"key":"126_CR4","volume-title":"SC\u201907: Proceedings of the 2007 ACM\/IEEE Conference on Supercomputing","author":"L. Yang","year":"2007","unstructured":"Yang, L., Schopf, J., Foster, I.: Anomaly detection and diagnosis in grid environments. In: SC\u201907: Proceedings of the 2007 ACM\/IEEE Conference on Supercomputing. IEEE Computer Society, Washington (2007)"},{"key":"126_CR5","doi-asserted-by":"crossref","first-page":"476","DOI":"10.1109\/DSN.2005.50","volume-title":"Proceedings of the 2005 International Conference on Dependable Systems and Networks (DSN\u201905)","author":"Y. Liang","year":"2005","unstructured":"Liang, Y., Sivasubramaniam, A., Moreira, J.: Filtering failure logs for a bluegene\/l prototype. In: Proceedings of the 2005 International Conference on Dependable Systems and Networks (DSN\u201905), pp. 476\u2013485. IEEE Computer Society, Washington (2005)"},{"key":"126_CR6","doi-asserted-by":"crossref","unstructured":"Fu, S., Xu, C.: Exploring event correlation for failure prediction in coalitions of clusters. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis (SC\u201907) (2007)","DOI":"10.1145\/1362622.1362678"},{"key":"126_CR7","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1095810.1118581","volume-title":"Proceedings of the Twentieth ACM Symposium on Operating Systems Principles (SOSP\u201905)","author":"D. Oppenheimer","year":"2005","unstructured":"Oppenheimer, D., et al.: Service placement in shared wide-area platforms. In: Proceedings of the Twentieth ACM Symposium on Operating Systems Principles (SOSP\u201905), p.\u00a01. ACM, New York (2005)"},{"key":"126_CR8","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Performance implications of failures in large-scale cluster scheduling. In: Proceedings of 10th International WorkshopJob Scheduling Strategies for Parallel Processing (JSSPP\u201904), pp. 233\u2013252 (2004)","DOI":"10.1007\/11407522_13"},{"key":"126_CR9","doi-asserted-by":"crossref","first-page":"249","DOI":"10.1109\/DSN.2006.5","volume-title":"Proceedings of the International Conference on Dependable Systems and Networks (DSN\u201906)","author":"B. Schroeder","year":"2006","unstructured":"Schroeder, B., Gibson, G.: A large-scale study of failures in high-performance computing systems. In: Proceedings of the International Conference on Dependable Systems and Networks (DSN\u201906), pp. 249\u2013258. IEEE Computer Society, Washington (2006)"},{"key":"126_CR10","unstructured":"Yalagandula, P., et al.: Beyond availability: Towards a deeper understanding of machine failure characteristics in large distributed systems. In: Proceedings of the Workshop on Real, Large Distributed Systems (WORLDS\u201904) (2004)"},{"issue":"2","key":"126_CR11","doi-asserted-by":"crossref","first-page":"173","DOI":"10.1007\/s10723-007-9077-5","volume":"5","author":"X. Ren","year":"2007","unstructured":"Ren, X., et al.: Prediction of resource availability in fine-grained cycle sharing systems empirical evaluation. J. Grid Comput. 5(2), 173\u2013195 (2007)","journal-title":"J. Grid Comput."},{"key":"126_CR12","doi-asserted-by":"crossref","unstructured":"Salfner, F., Schieschke, M., Malek, M.: Predicting failures of computer systems: a case study for a telecommunication system. In: Proceedings of the 20th International Parallel and Distributed Processing Symposium (IPDPS 2006) (2006)","DOI":"10.1109\/IPDPS.2006.1639672"},{"key":"126_CR13","first-page":"531","volume-title":"Proceedings of the Sixth IEEE International Symposium on Cluster Computing and the Grid (CCGRID\u201906)","author":"Y. Li","year":"2006","unstructured":"Li, Y., Lan, Z.: Exploit failure prediction for adaptive fault-tolerance in cluster computing. In: Proceedings of the Sixth IEEE International Symposium on Cluster Computing and the Grid (CCGRID\u201906), pp. 531\u2013538. IEEE Computer Society, Washington (2006)"},{"key":"126_CR14","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1109\/ICPP.2007.42","volume-title":"Proceedings of the 2007 International Conference on Parallel Processing (ICPP\u201907)","author":"Y. Li","year":"2007","unstructured":"Li, Y., et al.: Fault-driven re-scheduling for improving system-level fault resilience. In: Proceedings of the 2007 International Conference on Parallel Processing (ICPP\u201907), p. 39. IEEE Computer Society, Washington (2007)"},{"key":"126_CR15","volume-title":"Proceedings of the 18th International Parallel and Distributed Processing Symposium (IPDPS\u201904)","author":"A. Oliner","year":"2004","unstructured":"Oliner, A., et al.: Fault-aware job scheduling for bluegene\/l systems. In: Proceedings of the 18th International Parallel and Distributed Processing Symposium (IPDPS\u201904). IEEE Computer Society, Washington (2004)"},{"key":"126_CR16","doi-asserted-by":"crossref","first-page":"126","DOI":"10.1109\/HPDC.2003.1210023","volume-title":"Proceedings of the 12th IEEE International Symposium on High Performance Distributed Computing (HPDC\u201903)","author":"S. Hwang","year":"2003","unstructured":"Hwang, S., Kesselman, C.: Gridworkflow: A flexible failure handling framework for the grid. In: Proceedings of the 12th IEEE International Symposium on High Performance Distributed Computing (HPDC\u201903), p.\u00a0126. IEEE Computer Society, Washington (2003)"},{"key":"126_CR17","volume-title":"Proceedings of the 18th International Parallel and Distributed Processing Symposium (IPDPS\u201904)","author":"J.H. Abawajy","year":"2004","unstructured":"Abawajy, J.H.: Fault-tolerant scheduling policy for grid computing systems. In: Proceedings of the 18th International Parallel and Distributed Processing Symposium (IPDPS\u201904). IEEE Computer Society, Washington (2004)"},{"key":"126_CR18","unstructured":"Dagman. [Online]. Available: http:\/\/www.cs.wisc.edu\/condor\/dagman\/"},{"issue":"3","key":"126_CR19","doi-asserted-by":"crossref","first-page":"308","DOI":"10.1109\/71.993209","volume":"13","author":"A. Dogan","year":"2002","unstructured":"Dogan, A., \u00d6zg\u00fcner, F.: Matching and scheduling algorithms for minimizing execution time and failure probability of applications in heterogeneous computing. IEEE Trans. Parallel Distrib. Syst. 13(3), 308\u2013323 (2002)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"126_CR20","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1007\/978-1-4615-0509-9_7","volume-title":"Grid Resource Management: State of the Art and Future Trends","author":"E. Deelman","year":"2004","unstructured":"Deelman, E., Blythe, J., Gil, Y., Kesselman, C.: Workflow management in griphyn. In: Grid Resource Management: State of the Art and Future Trends, pp. 99\u2013116. Kluwer Academic, Norwell (2004)"},{"key":"126_CR21","unstructured":"Planet lab. [Online]. Available: http:\/\/www.planet-lab.org"},{"key":"126_CR22","doi-asserted-by":"crossref","unstructured":"Yu, Z., Shi, W.: A planner-guided scheduling strategy for multiple grid workflow applications. In: Proceeding of Fourth International Workshop on Scheduling and Resource Management for Parallel and Distributed Systems (SRMPDS \u201908), Portland, Oregon, USA, September 2008","DOI":"10.1109\/ICPP-W.2008.10"},{"key":"126_CR23","unstructured":"Los Alamos National Laboratory. Operational data to support and enable computer science research (2006). [Online]. Available: http:\/\/institutes.lanl.gov\/data\/fdata\/"},{"key":"126_CR24","first-page":"281.1","volume-title":"Proceedings of the 19th IEEE International Parallel and Distributed Processing Symposium (IPDPS\u201905)\u2014Workshop 16","author":"F. Salfner","year":"2005","unstructured":"Salfner, F., Malek, M.: Proactive fault handling for system availability enhancement. In: Proceedings of the 19th IEEE International Parallel and Distributed Processing Symposium (IPDPS\u201905)\u2014Workshop 16, p. 281.1. IEEE Computer Society, Washington (2005)"},{"key":"126_CR25","doi-asserted-by":"crossref","unstructured":"Yu, Z., Shi, W.: An adaptive rescheduling strategy for grid workflow applications. In: Proceeding of 21st International Parallel and Distributed Processing Symposium (IPDPS\u201907), Long Beach, Florida, USA, March 2007","DOI":"10.1109\/IPDPS.2007.370305"},{"issue":"3","key":"126_CR26","doi-asserted-by":"crossref","first-page":"260","DOI":"10.1109\/71.993206","volume":"13","author":"H. Topcuouglu","year":"2002","unstructured":"Topcuouglu, H., Hariri, S., Wu, M.: Performance-effective and low-complexity task scheduling for heterogeneous computing. IEEE Trans. Parallel Distrib. Syst. 13(3), 260\u2013274 (2002)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"126_CR27","first-page":"299.2","volume-title":"Proceedings of the 19th IEEE International Parallel and Distributed Processing Symposium (IPDPS\u201905)","author":"A. Oliner","year":"2005","unstructured":"Oliner, A., Sahoo, R., Moreira, J., Gupta, M.: Performance implications of periodic checkpointing on large-scale cluster systems. In: Proceedings of the 19th IEEE International Parallel and Distributed Processing Symposium (IPDPS\u201905), p.\u00a0299.2. IEEE Computer Society, Washington (2005)"},{"key":"126_CR28","doi-asserted-by":"crossref","unstructured":"Schroeder, B., Gibson, G.: Understanding failures in petascale computers. J. Phys., Condens. Matter 19(45) (2007)","DOI":"10.1088\/1742-6596\/78\/1\/012022"},{"key":"126_CR29","volume-title":"Proceedings of the 16th International Conference on Parallel and Distributed Computing and Systems (PDCS\u201904)","author":"U. H\u00f6nig","year":"2004","unstructured":"H\u00f6nig, U., Schiffmann, W.: A comprehensive test bench for the evaluation of scheduling heuristics. In: Proceedings of the 16th International Conference on Parallel and Distributed Computing and Systems (PDCS\u201904). IEEE, New York (2004)"},{"key":"126_CR30","first-page":"63","volume-title":"Integration Research in Grid Computing, CoreGRID Integration Workshop","author":"L.-C. Canon","year":"2008","unstructured":"Canon, L.-C., Jeannot, E., Sakellariou, R., Zheng, W.: Comparative evaluation of the robustness of dag scheduling heuristics. In: Integration Research in Grid Computing, CoreGRID Integration Workshop, pp.\u00a063\u201374. Crete University Press, Heraklion (2008)"}],"container-title":["Cluster Computing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-010-0126-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10586-010-0126-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-010-0126-7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,30]],"date-time":"2019-05-30T18:40:12Z","timestamp":1559241612000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10586-010-0126-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010,3,13]]},"references-count":30,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2010,12]]}},"alternative-id":["126"],"URL":"https:\/\/doi.org\/10.1007\/s10586-010-0126-7","relation":{},"ISSN":["1386-7857","1573-7543"],"issn-type":[{"value":"1386-7857","type":"print"},{"value":"1573-7543","type":"electronic"}],"subject":[],"published":{"date-parts":[[2010,3,13]]}}}