{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T07:44:46Z","timestamp":1768635886639,"version":"3.49.0"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030787127","type":"print"},{"value":"9783030787134","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-78713-4_11","type":"book-chapter","created":{"date-parts":[[2021,6,16]],"date-time":"2021-06-16T23:06:15Z","timestamp":1623884775000},"page":"195-214","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":16,"title":["Proctor: A Semi-Supervised Performance Anomaly Diagnosis Framework for Production HPC Systems"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3627-7311","authenticated-orcid":false,"given":"Burak","family":"Aksar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yijia","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Emre","family":"Ates","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Benjamin","family":"Schwaller","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Omar","family":"Aaziz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vitus J.","family":"Leung","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jim","family":"Brandt","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Manuel","family":"Egele","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ayse K.","family":"Coskun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,6,17]]},"reference":[{"key":"11_CR1","doi-asserted-by":"crossref","unstructured":"Agelastos, A., Allan, B., Brandt, J., et al.: The lightweight distributed metric service: a scalable infrastructure for continuous monitoring of large scale computing systems and applications. In: SC 2014: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 154\u2013165 (2014)","DOI":"10.1109\/SC.2014.18"},{"key":"11_CR2","doi-asserted-by":"crossref","unstructured":"Agelastos, A., Allan, B., Brandt, J., et al.: Toward rapid understanding of production HPC applications and systems. In: IEEE International Conference on Cluster Computing, pp. 464\u2013473 (2015)","DOI":"10.1109\/CLUSTER.2015.71"},{"key":"11_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"329","DOI":"10.1007\/978-3-319-10584-0_22","volume-title":"Computer Vision \u2013 ECCV 2014","author":"P Agrawal","year":"2014","unstructured":"Agrawal, P., Girshick, R., Malik, J.: Analyzing the performance of multilayer neural networks for object recognition. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8695, pp. 329\u2013344. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10584-0_22"},{"key":"11_CR4","unstructured":"Ahmad, W.A., Bartolini, A., Beneventi, F., et al.: Design of an energy aware petaflops class high performance cluster based on power architecture. In: IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), pp. 964\u2013973 (2017)"},{"issue":"1","key":"11_CR5","first-page":"3563","volume":"15","author":"G Alain","year":"2014","unstructured":"Alain, G., Bengio, Y.: What regularized auto-encoders learn from the data-generating distribution. J. Mach. Learn. Res. 15(1), 3563\u20133593 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"11_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"92","DOI":"10.1007\/978-3-319-96983-1_7","volume-title":"Euro-Par 2018: Parallel Processing","author":"E Ates","year":"2018","unstructured":"Ates, E., et al.: Taxonomist: application detection through rich monitoring data. In: Aldinucci, M., Padovani, L., Torquati, M. (eds.) Euro-Par 2018. LNCS, vol. 11014, pp. 92\u2013105. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-96983-1_7"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Ates, E., Zhang, Y., Aksar, B., et al.: HPAS: an HPC performance anomaly suite for reproducing performance variations. In: ACM Proceedings of the 48th International Conference on Parallel Processing, pp. 1\u201310, August 2019","DOI":"10.1145\/3337821.3337907"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Bailey, D.H., Barszcz, E., Barton, J.T., et al.: The NAS parallel benchmarks summary and preliminary results. In: Supercomputing 1991: Proceedings of the 1991 ACM\/IEEE Conference on Supercomputing, pp. 158\u2013165 (1991)","DOI":"10.1145\/125826.125925"},{"key":"11_CR9","unstructured":"Baseman, E., Blanchard, S., DeBardeleben, N., Bonnie, A., Morrow, A.: Interpretable anomaly detection for monitoring of high performance computing systems. In: Outlier Definition, Detection, and Description on Demand Workshop at ACM SIGKDD, San Francisco, August 2016 (2016)"},{"key":"11_CR10","doi-asserted-by":"crossref","unstructured":"Beneventi, F., Bartolini, A., Cavazzoni, C., Benini, L.: Continuous learning of HPC infrastructure models using big data analytics and in-memory processing tools. In: Design, Automation Test in Europe Conference Exhibition (DATE), pp. 1038\u20131043 (2017)","DOI":"10.23919\/DATE.2017.7927143"},{"key":"11_CR11","doi-asserted-by":"crossref","unstructured":"Bengio, Y.: Learning Deep Architectures for AI. Now Publishers Inc., New York (2009)","DOI":"10.1561\/9781601982957"},{"key":"11_CR12","doi-asserted-by":"crossref","unstructured":"Bhatele, A., Mohror, K., Langer, S.H., Isaacs, K.E.: There goes the neighborhood: performance degradation due to nearby jobs. In: SC 2013: IEEE Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312 (2013)","DOI":"10.1145\/2503210.2503247"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Bodik, P., Goldszmidt, M., Fox, A., Woodard, D.B., Andersen, H.: Fingerprinting the datacenter: automated classification of performance crises. In: Proceedings of the 5th European Conference on Computer Systems, pp. 111\u2013124 (2010)","DOI":"10.1145\/1755913.1755926"},{"key":"11_CR14","doi-asserted-by":"publisher","first-page":"634","DOI":"10.1016\/j.engappai.2019.07.008","volume":"85","author":"A Borghesi","year":"2019","unstructured":"Borghesi, A., Bartolini, A., Lombardi, M., Milano, M., Benini, L.: A semisupervised autoencoder-based approach for anomaly detection in high performance computing systems. Eng. Appl. Artif. Intell. 85, 634\u2013644 (2019)","journal-title":"Eng. Appl. Artif. Intell."},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Borghesi, A., Bartolini, A., Lombardi, M., et al.: Anomaly detection using autoencoders in high performance computing systems. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 9428\u20139433, July 2019. arXiv: 1811.05269","DOI":"10.1609\/aaai.v33i01.33019428"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Brandt, J., Chen, F., et al.: Quantifying effectiveness of failure prediction and response in HPC systems: methodology and example. In: IEEE International Conference on Dependable Systems and Networks Workshops (DSN-W), pp. 2\u20137 (2010)","DOI":"10.1109\/DSNW.2010.5542629"},{"key":"11_CR17","doi-asserted-by":"crossref","unstructured":"Ciregan, D., Meier, U., Schmidhuber, J.: Multi-column deep neural networks for image classification. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 3642\u20133649 (2012)","DOI":"10.1109\/CVPR.2012.6248110"},{"key":"11_CR18","doi-asserted-by":"crossref","unstructured":"Dorier, M., Antoniu, G., Ross, R., et al.: CALCioM: mitigating I\/O interference in HPC systems through cross-application coordination. In: IEEE 28th International Parallel and Distributed Processing Symposium, pp. 155\u2013164 (2014)","DOI":"10.1109\/IPDPS.2014.27"},{"key":"11_CR19","unstructured":"Exascale proxy applications. https:\/\/proxyapps.exascaleproject.org\/"},{"key":"11_CR20","unstructured":"Ganglia monitoring system. http:\/\/ganglia.info\/"},{"key":"11_CR21","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 580\u2013587 (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Habib, S., Morozov, V., Frontiere, N., Finkel, H., Pope, A., Heitmann, K.: HACC: extreme scaling and performance across diverse architectures. In: SC 2013: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 1\u201310. IEEE (2013)","DOI":"10.1145\/2503210.2504566"},{"key":"11_CR23","unstructured":"Heroux, M.A., et al.: Improving performance via mini-applications. Sandia National Laboratories, Technical report, SAND2009-5574 3 (2009)"},{"issue":"7","key":"11_CR24","doi-asserted-by":"publisher","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"Hinton, G.E., Osindero, S., Teh, Y.W.: A fast learning algorithm for deep belief nets. Neural Comput. 18(7), 1527\u20131554 (2006)","journal-title":"Neural Comput."},{"key":"11_CR25","unstructured":"Hinton, G.E., Zemel, R.S.: Autoencoders, minimum description length and Helmholtz free energy. In: Proceedings of the 6th International Conference on Neural Information Processing Systems. NIPS 1993, pp. 3\u201310. Morgan Kaufmann Publishers Inc., San Francisco (1993)"},{"issue":"1","key":"11_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2791120","volume":"48","author":"O Ibidunmoye","year":"2015","unstructured":"Ibidunmoye, O., Hern\u00e1ndez-Rodriguez, F., Elmroth, E.: Performance anomaly detection and bottleneck identification. ACM Comput. Surv. (CSUR) 48(1), 1\u201335 (2015)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"11_CR27","doi-asserted-by":"crossref","unstructured":"Klinkenberg, J., Terboven, C., Lankes, S., M\u00fcller, M.S.: Data mining-based analysis of HPC center operations. In: IEEE International Conference on Cluster Computing, pp. 766\u2013773 (2017)","DOI":"10.1109\/CLUSTER.2017.23"},{"key":"11_CR28","doi-asserted-by":"crossref","unstructured":"Kunang, Y.N., Nurmaini, S., Stiawan, D., Zarkasi, A., Jasmir, F.: Automatic features extraction using autoencoder in intrusion detection system. In: IEEE International Conference on Electrical Engineering and Computer Science (ICECOS), pp. 219\u2013224 (2018)","DOI":"10.1109\/ICECOS.2018.8605181"},{"key":"11_CR29","unstructured":"Kunen, A.J., Bailey, T.S., Brown, P.N.: KRIPKE-a massively parallel transport mini-app. Technical report, Lawrence Livermore National Lab. (LLNL), Livermore, CA (United States) (2015)"},{"key":"11_CR30","doi-asserted-by":"crossref","unstructured":"Leung, V.J., Bender, M.A., Bunde, D.P., Phillips, C.A.: Algorithmic support for commodity-based parallel computing systems. Technical report, Sandia National Laboratories (2003)","DOI":"10.2172\/918344"},{"key":"11_CR31","doi-asserted-by":"crossref","unstructured":"Liu, G., Bao, H., Han, B.: A stacked autoencoder-based deep neural network for achieving gearbox fault diagnosis. Math. Probl. Eng. (2018)","DOI":"10.1155\/2018\/5105709"},{"key":"11_CR32","doi-asserted-by":"crossref","unstructured":"Luo, T., Nagarajan, S.G.: Distributed anomaly detection using autoencoder neural networks in WSN for IoT. In: IEEE International Conference on Communications (ICC), pp. 1\u20136 (2018)","DOI":"10.1109\/ICC.2018.8422402"},{"key":"11_CR33","unstructured":"Minhas, M.S., Zelek, J.: Semi-supervised anomaly detection using autoencoders. arXiv:2001.03674 [cs, eess, stat], January 2020. http:\/\/arxiv.org\/abs\/2001.03674"},{"key":"11_CR34","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted Boltzmann machines. In: ICML (2010)"},{"key":"11_CR35","unstructured":"Petersson, N., Sj\u00f6green, B.: Sw4 v1.1 [software] (2014). https:\/\/doi.org\/http:\/\/doi.org\/10.5281\/zenodo.571844"},{"issue":"1","key":"11_CR36","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1006\/jcph.1995.1039","volume":"117","author":"S Plimpton","year":"1995","unstructured":"Plimpton, S.: Fast parallel algorithms for short-range molecular dynamics. J. Comput. Phys. 117(1), 1\u201319 (1995)","journal-title":"J. Comput. Phys."},{"key":"11_CR37","doi-asserted-by":"crossref","unstructured":"Sato, D., Hanaoka, S., Nomura, Y., et al.: A primitive study on unsupervised anomaly detection with an autoencoder in emergency head CT volumes. In: Medical Imaging: Computer-Aided Diagnosis, vol. 10575, p. 105751P. International Society for Optics and Photonics (2018)","DOI":"10.1117\/12.2292276"},{"key":"11_CR38","doi-asserted-by":"crossref","unstructured":"Schwaller, B., Tucker, N., Tucker, T., Allan, B., Brandt, J.: HPC system data pipeline to enable meaningful insights through analysis-driven visualizations. In: IEEE International Conference on Cluster Computing, pp. 433\u2013441, September 2020","DOI":"10.1109\/CLUSTER49012.2020.00062"},{"issue":"2","key":"11_CR39","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1177\/1094342014522573","volume":"28","author":"M Snir","year":"2014","unstructured":"Snir, M., Carlson, B., et al.: Addressing failures in exascale computing. Int. J. High Perf. Comput. Appl. 28(2), 129\u2013173 (2014)","journal-title":"Int. J. High Perf. Comput. Appl."},{"key":"11_CR40","doi-asserted-by":"crossref","unstructured":"Song, H., Jiang, Z., et\u00a0al.: A hybrid semi-supervised anomaly detection model for high-dimensional data. Comput. Intell. Neurosci. (2017)","DOI":"10.1155\/2017\/8501683"},{"issue":"1","key":"11_CR41","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15(1), 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"11_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"355","DOI":"10.1007\/978-3-319-58667-0_19","volume-title":"High Performance Computing","author":"O Tuncer","year":"2017","unstructured":"Tuncer, O., et al.: Diagnosing performance variations in HPC applications using machine learning. In: Kunkel, J.M., Yokota, R., Balaji, P., Keyes, D. (eds.) ISC 2017. LNCS, vol. 10266, pp. 355\u2013373. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-58667-0_19"},{"issue":"4","key":"11_CR43","doi-asserted-by":"publisher","first-page":"883","DOI":"10.1109\/TPDS.2018.2870403","volume":"30","author":"O Tuncer","year":"2018","unstructured":"Tuncer, O., Ates, E., Zhang, Y., et al.: Online diagnosis of performance variation in HPC systems using machine learning. IEEE Trans. Parallel Distrib. Syst. 30(4), 883\u2013896 (2018)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"11_CR44","doi-asserted-by":"crossref","unstructured":"Wang, K., et al.: Research on healthy anomaly detection model based on deep learning from multiple time-series physiological signals. Sci. Program. (2016)","DOI":"10.1155\/2016\/5642856"},{"issue":"7","key":"11_CR45","doi-asserted-by":"publisher","first-page":"1902","DOI":"10.1109\/TPDS.2015.2475741","volume":"27","author":"L Yu","year":"2015","unstructured":"Yu, L., Lan, Z.: A scalable, non-parametric method for detecting performance anomaly in large scale computing. IEEE Trans. Parallel Distrib. Syst. 27(7), 1902\u20131914 (2015)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"11_CR46","doi-asserted-by":"crossref","unstructured":"Zhou, C., Paffenroth, R.C.: Anomaly detection with robust deep autoencoders. In: Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 665\u2013674 (2017)","DOI":"10.1145\/3097983.3098052"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-78713-4_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,3,29]],"date-time":"2023-03-29T07:05:48Z","timestamp":1680073548000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-78713-4_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030787127","9783030787134"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-78713-4_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"17 June 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISC High Performance","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on High Performance Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 June 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 July 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"36","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"supercomputing2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.isc-hpc.com\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Linklings","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"74","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"24","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"32% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.28","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.13","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"In the ISC High Performance Workshop, there were 49 submissions, out of which 35  were accepted.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}