{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:51:55Z","timestamp":1771951915536,"version":"3.50.1"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319586663","type":"print"},{"value":"9783319586670","type":"electronic"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-58667-0_19","type":"book-chapter","created":{"date-parts":[[2017,5,11]],"date-time":"2017-05-11T15:27:38Z","timestamp":1494516458000},"page":"355-373","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":60,"title":["Diagnosing Performance Variations in HPC Applications Using Machine Learning"],"prefix":"10.1007","author":[{"given":"Ozan","family":"Tuncer","sequence":"first","affiliation":[]},{"given":"Emre","family":"Ates","sequence":"additional","affiliation":[]},{"given":"Yijia","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Ata","family":"Turk","sequence":"additional","affiliation":[]},{"given":"Jim","family":"Brandt","sequence":"additional","affiliation":[]},{"given":"Vitus J.","family":"Leung","sequence":"additional","affiliation":[]},{"given":"Manuel","family":"Egele","sequence":"additional","affiliation":[]},{"given":"Ayse K.","family":"Coskun","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,5,12]]},"reference":[{"key":"19_CR1","unstructured":"Cisco bug: Csctf52095 - manually flushing os cache during load impacts server. https:\/\/quickview.cloudapps.cisco.com\/quickview\/bug\/CSCtf52095"},{"key":"19_CR2","unstructured":"Ganglia. ganglia.info"},{"key":"19_CR3","unstructured":"Massachusetts Open Cloud (MOC). http:\/\/info.massopencloud.org"},{"key":"19_CR4","unstructured":"MOC public code repository for kilo-puppet sensu modules. https:\/\/github.com\/CCI-MOC\/kilo-puppet\/tree\/liberty\/sensu. Accessed 27 Oct 2016"},{"key":"19_CR5","unstructured":"Nagios. www.nagios.org"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Agelastos, A., Allan, B., Brandt, J., Gentile, A., Lefantzi, S., Monk, S., Ogden, J., Rajan, M., Stevenson, J.: Toward rapid understanding of production HPC applications and systems. In: IEEE International Conference on Cluster Computing, pp. 464\u2013473, September 2015","DOI":"10.1109\/CLUSTER.2015.71"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Agelastos, A., et al.: The lightweight distributed metric service: a scalable infrastructure for continuous monitoring of large scale computing systems and applications. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC), pp. 154\u2013165, November 2014","DOI":"10.1109\/SC.2014.18"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Bailey, D.H., et al.: The NAS parallel benchmarks - summary and preliminary results. In: Proceedings of the ACM\/IEEE Conference on Supercomputing, pp. 158\u2013165, August 1991","DOI":"10.1145\/125826.125925"},{"key":"19_CR9","unstructured":"Baseman, E., Blanchard, S., Debardeleben, N., Bonnie, A., Morrow, A.: Interpretable anomaly detection for monitoring of high performance computing systems. In: Outlier Definition, Detection, and Description on Demand Workshop at ACM SIGKDD, San Francisco, August 2016"},{"key":"19_CR10","unstructured":"Berndt, D.J., Clifford, J.: Using dynamic time warping to find patterns in time series. In: Proceedings of the 3rd International Conference on Knowledge Discovery and Data Mining, vol. 10, pp. 359\u2013370, August 1994"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Bhatele, A., Titus, A.R., Thiagarajan, J.J., Jain, N., Gamblin, T., Bremer, P.T., Schulz, M., Kale, L.V.: Identifying the culprits behind network congestion. In: IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 113\u2013122, May 2015","DOI":"10.1109\/IPDPS.2015.92"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Bhatele, A., Mohror, K., Langer, S.H., Isaacs, K.E.: There goes the neighborhood: performance degradation due to nearby jobs. In: SC, pp. 41:1\u201341:12, November 2013","DOI":"10.1145\/2503210.2503247"},{"key":"19_CR13","unstructured":"Bod\u00edk, P., Fox, A., Jordan, M.I., Patterson, D., Banerjee, A., Jagannathan, R., Su, T., Tenginakai, S., Turner, B., Ingalls, J.: Advanced tools for operators at amazon.com. In: Proceedings of the First International Conference on Hot Topics in Autonomic Computing, June 2006"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Bodik, P., Goldszmidt, M., Fox, A., Woodard, D.B., Andersen, H.: Fingerprinting the datacenter: automated classification of performance crises. In: Proceedings of the 5th European Conference on Computer Systems, pp. 111\u2013124 (2010)","DOI":"10.1145\/1755913.1755926"},{"key":"19_CR15","unstructured":"Brandt, J., et al.: Enabling advanced operational analysis through multi-subsystem data integration on trinity. In: Proceedings of the Cray User\u2019s Group (2015)"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Brandt, J., Chen, F., De Sapio, V., Gentile, A., Mayo, J., Pebay, P., Roe, D., Thompson, D., Wong, M.: Quantifying effectiveness of failure prediction and response in HPC systems: methodology and example. In: Proceedings of the International Conference on Dependable Systems and Networks Workshops, pp. 2\u20137, June 2010","DOI":"10.1109\/DSNW.2010.5542629"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Brandt, J., Gentile, A., Mayo, J., P\u00e9bay, P., Roe, D., Thompson, D., Wong, M.: Methodologies for advance warning of compute cluster problems via statistical analysis: a case study. In: Proceedings of the 2009 Workshop on Resiliency in High Performance, pp. 7\u201314, June 2009","DOI":"10.1145\/1552526.1552528"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Dorier, M., Antoniu, G., Ross, R., Kimpe, D., Ibrahim, S.: Calciom: mitigating I\/O interference in HPC systems through cross-application coordination. In: IPDPS, pp. 155\u2013164, May 2014","DOI":"10.1109\/IPDPS.2014.27"},{"issue":"1","key":"19_CR19","doi-asserted-by":"publisher","first-page":"2","DOI":"10.1016\/j.jss.2012.06.025","volume":"86","author":"I Fronza","year":"2013","unstructured":"Fronza, I., Sillitti, A., Succi, G., Terho, M., Vlasenko, J.: Failure prediction based on log files using random indexing and support vector machines. J. Syst. Softw. 86(1), 2\u201311 (2013)","journal-title":"J. Syst. Softw."},{"key":"19_CR20","doi-asserted-by":"crossref","unstructured":"Fu, S.: Performance metric selection for autonomic anomaly detection on cloud computing systems. In: IEEE Global Telecommunications Conference, pp. 1\u20135, December 2011","DOI":"10.1109\/GLOCOM.2011.6134532"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Gainaru, A., Cappello, F., Snir, M., Kramer, W.: Fault prediction under the microscope: a closer look into HPC systems. In: SC, pp. 77:1\u201377:11, November 2012","DOI":"10.1109\/SC.2012.57"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Giannerini, S.: The quest for nonlinearity in time series. In: Handbook of Statistics: Time Series, vol. 30, pp. 43\u201363 (2012)","DOI":"10.1016\/B978-0-444-53858-1.00003-X"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Guan, Q., Fu, S.: Adaptive anomaly identification by exploring metric subspace in cloud computing infrastructures. In: IEEE 32nd International Symposium on Reliable Distributed Systems, pp. 205\u2013214, September 2013","DOI":"10.1109\/SRDS.2013.29"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Heien, E., LaPine, D., Kondo, D., Kramer, B., Gainaru, A., Cappello, F.: Modeling and tolerating heterogeneous failures in large parallel systems. In: SC, pp. 1\u201311, November 2011","DOI":"10.1145\/2063384.2063444"},{"issue":"1","key":"19_CR25","doi-asserted-by":"publisher","first-page":"4:1","DOI":"10.1145\/2791120","volume":"48","author":"O Ibidunmoye","year":"2015","unstructured":"Ibidunmoye, O., Hern\u00e1ndez-Rodriguez, F., Elmroth, E.: Performance anomaly detection and bottleneck identification. ACM Comput. Surv. 48(1), 4:1\u20134:35 (2015)","journal-title":"ACM Comput. Surv."},{"key":"19_CR26","unstructured":"Kasick, M.P., Gandhi, R., Narasimhan, P.: Behavior-based problem localization for parallel file systems. In: Proceedings of the 6th Workshop on Hot Topics in System Dependability, October 2010"},{"issue":"2","key":"19_CR27","doi-asserted-by":"publisher","first-page":"174","DOI":"10.1109\/TPDS.2009.52","volume":"21","author":"Z Lan","year":"2010","unstructured":"Lan, Z., Zheng, Z., Li, Y.: Toward automated anomaly identification in large-scale systems. IEEE Trans. Parallel Distrib. Syst. 21(2), 174\u2013187 (2010)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Leung, V.J., Phillips, C.A., Bender, M.A., Bunde, D.P.: Algorithmic support for commodity-based parallel computing systems. Technical report SAND2003-3702, Sandia National Laboratories (2003)","DOI":"10.2172\/918344"},{"key":"19_CR29","first-page":"2825","volume":"12","author":"F Pedregosa","year":"2011","unstructured":"Pedregosa, F., et al.: Scikit-learn: machine learning in Python. J. Mach. Learn. Res. 12, 2825\u20132830 (2011)","journal-title":"J. Mach. Learn. Res."},{"issue":"3","key":"19_CR30","first-page":"38","volume":"55","author":"O Sefraoui","year":"2012","unstructured":"Sefraoui, O., Aissaoui, M., Eleuldj, M.: OpenStack: toward an open-source solution for cloud computing. Int. J. Comput. Appl. 55(3), 38\u201342 (2012)","journal-title":"Int. J. Comput. Appl."},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Skinner, D., Kramer, W.: Understanding the causes of performance variability in HPC workloads. In: IEEE International Symposium on Workload Characterization, pp. 137\u2013149, October 2005","DOI":"10.1109\/IISWC.2005.1526010"},{"key":"19_CR32","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1177\/1094342014522573","volume":"28","author":"M Snir","year":"2014","unstructured":"Snir, M., et al.: Addressing failures in exascale computing. Int. J. High Perform. Comput. Appl. 28, 129\u2013173 (2014)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"19_CR33","unstructured":"Sterling, T., Becker, D.J., Savarese, D., Dorband, J.E., Ranawake, U.A., Packer, C.V.: Beowulf: a parallel workstation for scientific computation. In: Proceedings of the 24th International Conference on Parallel Processing, pp. 11\u201314 (1995)"},{"key":"19_CR34","unstructured":"Turk, A., Chen, H., Tuncer, O., Li, H., Li, Q., Krieger, O., Coskun, A.K.: Seeing into a public cloud: monitoring the Massachusetts open cloud. In: USENIX Workshop on Cool Topics on Sustainable Data Centers, March 2016"},{"issue":"3","key":"19_CR35","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10618-005-0039-x","volume":"13","author":"X Wang","year":"2006","unstructured":"Wang, X., Smith, K., Hyndman, R.: Characteristic-based clustering for time series data. Data Min. Knowl. Disc. 13(3), 335\u2013364 (2006)","journal-title":"Data Min. Knowl. Disc."},{"key":"19_CR36","volume-title":"Forecasting: Methods and Applications","author":"S Wheelwright","year":"1998","unstructured":"Wheelwright, S., Makridakis, S., Hyndman, R.J.: Forecasting: Methods and Applications. Wiley, Hoboken (1998)"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-58667-0_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T12:55:42Z","timestamp":1750251342000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-58667-0_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319586663","9783319586670"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-58667-0_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017]]},"assertion":[{"value":"12 May 2017","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}