{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T19:46:43Z","timestamp":1771703203342,"version":"3.50.1"},"publisher-location":"Cham","reference-count":23,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319232157","type":"print"},{"value":"9783319232164","type":"electronic"}],"license":[{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015]]},"DOI":"10.1007\/978-3-319-23216-4_2","type":"book-chapter","created":{"date-parts":[[2015,8,14]],"date-time":"2015-08-14T01:20:05Z","timestamp":1439515205000},"page":"18-32","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["DDC: Distributed Data Collection Framework for Failure Prediction in Tianhe Supercomputers"],"prefix":"10.1007","author":[{"given":"Wei","family":"Hu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanhuang","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guangming","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenrui","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guilin","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2015,8,15]]},"reference":[{"issue":"6","key":"2_CR1","doi-asserted-by":"publisher","first-page":"767","DOI":"10.1109\/TC.2011.106","volume":"61","author":"X Yang","year":"2012","unstructured":"Yang, X., Wang, Z., Xue, J., Zhou, Y.: The reliability wall for exascale supercomputing. IEEE Trans. Comput. 61(6), 767\u2013779 (2012)","journal-title":"IEEE Trans. Comput."},{"key":"2_CR2","unstructured":"Philp, I.R.: Software failures and the road to a petaflop machine. In: Proceedings of the 1st Workshop on High Performance Computing Reliability Issues, San Francisco, CA, USA (2005)"},{"key":"2_CR3","doi-asserted-by":"crossref","unstructured":"Chen, Y., Plank, J.S., Li, K.: CLIP: a checkpointing tool for message-passing parallel programs. In: SC 1997, NY, USA (1997)","DOI":"10.1145\/509593.509626"},{"issue":"1","key":"2_CR4","first-page":"494","volume":"46","author":"PH Hargrove","year":"2006","unstructured":"Hargrove, P.H., Duell, J.C.: Berkeley lab checkpoint\/restart (BLCR) for Linux clusters. J. Phys: Conf. Ser. 46(1), 494\u2013499 (2006)","journal-title":"J. Phys: Conf. Ser."},{"key":"2_CR5","unstructured":"Liang, Y., Zhang, Y., Sivasubramaniam, A., Jette, M., Sahoo, R.: BlueGene\/L failure analysis and prediction models. In: DSN 2006, Washington, DC, USA, pp. 425\u2013434 (2006)"},{"key":"2_CR6","doi-asserted-by":"crossref","unstructured":"Liang, Y., Zhang, Y., Xiong, H., Sahoo, R.: Failure prediction in IBM BlueGene\/L event logs. In: The Seventh IEEE International Conference on Data Mining, pp. 583\u2013588 (2007)","DOI":"10.1109\/ICDM.2007.46"},{"key":"2_CR7","doi-asserted-by":"crossref","unstructured":"Liang, Y., Zhang, Y., Xiong, H., Sahoo, R.: An adaptive semantic filter for Blue Gene\/L failure log analysis. In: IEEE International Parallel and Distributed Processing Symposium, pp. 1\u20138 (2007)","DOI":"10.1109\/IPDPS.2007.370635"},{"key":"2_CR8","unstructured":"Li, Y., Lan, Z.: Exploit failure prediction for adaptive fault-tolerance in cluster computing. In: CCGRID 2006, Washington, DC, USA, pp. 531\u2013538 (2006)"},{"issue":"6","key":"2_CR9","doi-asserted-by":"publisher","first-page":"630","DOI":"10.1016\/j.jpdc.2010.03.003","volume":"70","author":"Z Lan","year":"2010","unstructured":"Lan, Z., Gu, J., Zheng, Z., Thakur, R., Coghlan, S.: A study of dynamic meta-learning for failure prediction in large-scale systems. J. Parallel Distrib. Comput. 70(6), 630\u2013643 (2010)","journal-title":"J. Parallel Distrib. Comput."},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Yu, L., Tang, W., Lan, Z., Gupta, R., Desai, N., Coghlan, S., Buettner, D.: Co-analysis of RAS log and job log on Blue Gene\/P. In: IPDPS 2011, pp. 840\u2013851 (2011)","DOI":"10.1109\/IPDPS.2011.83"},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"Sahoo, R.K., Oliner, A.J., Rish, I., Gupta, M., Moreira, J.E., Ma, S., Vilalta, R., Sivasubramaniam, A.: Critical event prediction for proactive management in large-scale computer clusters. In: KDD 2003, NY, USA, pp. 426\u2013435 (2003)","DOI":"10.1145\/956750.956799"},{"key":"2_CR12","doi-asserted-by":"crossref","unstructured":"Oliner, A., Rudolph, L., Sahoo, R.: Cooperative checkpointing theory. In: IPDPS 2006, Washington, DC, USA, pp. 132\u2013141 (2006)","DOI":"10.1109\/IPDPS.2006.1639368"},{"issue":"2","key":"2_CR13","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1145\/2076450.2076466","volume":"55","author":"A Oliner","year":"2012","unstructured":"Oliner, A., Ganapathi, A., Xu, W.: Advances and challenges in log analysis. Commun. ACM 55(2), 55\u201361 (2012)","journal-title":"Commun. ACM"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Yamanishi, K., Maruyama, Y.: Dynamic syslog mining for network failure monitoring. In: KDD 2005, New York, NY, USA, pp. 499\u2013508 (2005)","DOI":"10.1145\/1081870.1081927"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Xu, W., Huang, L., Fox, A., Patterson, D., Jordan, M.I.: Detecting large-scale system problems by mining console logs. In: SOSP 2009, NY, USA, pp. 117\u2013132 (2009)","DOI":"10.1145\/1629575.1629587"},{"key":"2_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1007\/978-3-540-30179-0_27","volume-title":"Intelligence in Communication Systems","author":"R Vaarandi","year":"2004","unstructured":"Vaarandi, R.: A breadth-first algorithm for mining frequent patterns from event logs. In: Aagesen, F.A., Anutariya, C., Wuwongse, V. (eds.) INTELLCOMM 2004. LNCS, vol. 3283, pp. 293\u2013308. Springer, Heidelberg (2004)"},{"key":"2_CR17","doi-asserted-by":"crossref","unstructured":"Gainaru, A., Cappello, F., Snir, M., Kramer, W.: Fault prediction under the microscope: a closer look into HPC systems. In: SC 2012, Los Alamitos, CA, USA (2012)","DOI":"10.1109\/SC.2012.57"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Scott, S.L., Engelmann, C., Vallee, G.R., Naughton, T., Tikotekar, A., Ostrouchov, G., et al.: A tunable holistic resiliency approach for high-performance computing systems. In: PPoPP 2009, NY, USA, pp. 305\u2013306 (2009)","DOI":"10.1145\/1594835.1504227"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"Nagarajan, A.B., Mueller, F., Engelmann, C., Scott, S.L.: Proactive fault tolerance for HPC with Xen virtualization. In: ICS 2007, NY, USA, pp. 23\u201332 (2007)","DOI":"10.1145\/1274971.1274978"},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"Rajachandrasekar, R., Besseron, X., Panda, D.K.: Monitoring and predicting hardware failures in HPC clusters with FTB-IPMI. In: IEEE 26th International Parallel and Distributed Processing Symposium Workshops PhD Forum (IPDPSW), pp. 1136\u20131143 (2012)","DOI":"10.1109\/IPDPSW.2012.139"},{"issue":"7","key":"2_CR21","doi-asserted-by":"publisher","first-page":"723","DOI":"10.1002\/(SICI)1097-024X(200006)30:7<723::AID-SPE314>3.0.CO;2-5","volume":"30","author":"R Buyya","year":"2000","unstructured":"Buyya, R.: PARMON: a portable and scalable monitoring system for clusters. Softw. Pract. Exper. 30(7), 723\u2013739 (2000)","journal-title":"Softw. Pract. Exper."},{"issue":"7","key":"2_CR22","doi-asserted-by":"publisher","first-page":"817","DOI":"10.1016\/j.parco.2004.04.001","volume":"30","author":"ML Massie","year":"2004","unstructured":"Massie, M.L., Chun, B.N., Culler, D.E.: The ganglia distributed monitoring system: design, implementation, and experience. Parallel Comput. 30(7), 817\u2013840 (2004)","journal-title":"Parallel Comput."},{"key":"2_CR23","doi-asserted-by":"crossref","unstructured":"Brandt, J.M., Debusschere, B.J., Gentile, A.C., Mayo, J.R., Pebay, P.P., Thompson, D., et al.: Ovis-2: a robust distributed architecture for scalable RAS. In: IEEE International Symposium on Parallel and Distributed Processing, pp. 1\u20138 (2008)","DOI":"10.1109\/IPDPS.2008.4536549"}],"container-title":["Lecture Notes in Computer Science","Advanced Parallel Processing Technologies"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-23216-4_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,20]],"date-time":"2023-01-20T18:43:53Z","timestamp":1674240233000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-23216-4_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015]]},"ISBN":["9783319232157","9783319232164"],"references-count":23,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-23216-4_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015]]},"assertion":[{"value":"15 August 2015","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}