{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T07:33:15Z","timestamp":1768635195560,"version":"3.49.0"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030856649","type":"print"},{"value":"9783030856656","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-85665-6_5","type":"book-chapter","created":{"date-parts":[[2021,8,28]],"date-time":"2021-08-28T03:06:52Z","timestamp":1630120012000},"page":"70-85","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["E2EWatch: An End-to-End Anomaly Diagnosis Framework for Production HPC Systems"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3627-7311","authenticated-orcid":false,"given":"Burak","family":"Aksar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Benjamin","family":"Schwaller","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Omar","family":"Aaziz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vitus J.","family":"Leung","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jim","family":"Brandt","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Manuel","family":"Egele","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ayse K.","family":"Coskun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,8,25]]},"reference":[{"key":"5_CR1","doi-asserted-by":"crossref","unstructured":"Agelastos, A., et al.: The lightweight distributed metric service: a scalable infrastructure for continuous monitoring of large scale computing systems and applications. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC), pp. 154\u2013165 (2014)","DOI":"10.1109\/SC.2014.18"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Agelastos, A., et al.: Toward rapid understanding of production HPC applications and systems. In: IEEE International Conference on Cluster Computing, pp. 464\u2013473 (2015)","DOI":"10.1109\/CLUSTER.2015.71"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Ahad, R., Chan, E., Santos, A.: Toward autonomic cloud: automatic anomaly detection and resolution. In: International Conference on Cloud and Autonomic Computing, pp. 200\u2013203 (2015)","DOI":"10.1109\/ICCAC.2015.32"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Arzani, B., Ciraci, S., Loo, B.T., Schuster, A., Outhred, G.: Taking the blame game out of data centers operations with NetPoirot. In: Proceedings of the ACM SIGCOMM Conference, pp. 440\u2013453 (2016)","DOI":"10.1145\/2934872.2934884"},{"key":"5_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"92","DOI":"10.1007\/978-3-319-96983-1_7","volume-title":"Euro-Par 2018: Parallel Processing","author":"E Ates","year":"2018","unstructured":"Ates, E., et al.: Taxonomist: application detection through rich monitoring data. In: Aldinucci, M., Padovani, L., Torquati, M. (eds.) Euro-Par 2018. LNCS, vol. 11014, pp. 92\u2013105. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-96983-1_7"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Ates, E., Zhang, Y., Aksar, B., et al.: HPAS: an HPC performance anomaly suite for reproducing performance variations. In: Proceedings of the 48th International Conference on Parallel Processing, pp. 1\u201310. ACM, August 2019","DOI":"10.1145\/3337821.3337907"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Bartolini, A., et al.: The DAVIDE big-data-powered fine-grain power and performance monitoring support. In: Proceedings of the 15th ACM International Conference on Computing Frontiers, pp. 303\u2013308 (2018)","DOI":"10.1145\/3203217.3205863"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Bhatele, A., Mohror, K., Langer, S.H., Isaacs, K.E.: There goes the neighborhood: performance degradation due to nearby jobs. In: SC 2013: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312 (2013)","DOI":"10.1145\/2503210.2503247"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Bhatele, A., et al.: The case of performance variability on dragonfly-based systems. In: IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 896\u2013905 (2020)","DOI":"10.1109\/IPDPS47924.2020.00096"},{"key":"5_CR10","doi-asserted-by":"crossref","unstructured":"Bhuyan, M.H., Bhattacharyya, D., Kalita, J.K.: NADO: network anomaly detection using outlier approach. In: Proceedings of the International Conference on Communication, Computing and Security, pp. 531\u2013536 (2011)","DOI":"10.1145\/1947940.1948050"},{"key":"5_CR11","doi-asserted-by":"publisher","first-page":"634","DOI":"10.1016\/j.engappai.2019.07.008","volume":"85","author":"A Borghesi","year":"2019","unstructured":"Borghesi, A., Bartolini, A., Lombardi, M., Milano, M., Benini, L.: A semisupervised autoencoder-based approach for anomaly detection in high performance computing systems. Eng. Appl. Artif. Intell. 85, 634\u2013644 (2019)","journal-title":"Eng. Appl. Artif. Intell."},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Bourassa, N., et al.: Operational data analytics: optimizing the national energy research scientific computing center cooling systems. In: Proceedings of the 48th International Conference on Parallel Processing: Workshops, pp. 1\u20137 (2019)","DOI":"10.1145\/3339186.3339210"},{"key":"5_CR13","unstructured":"Brandt, J.M., et al.: Enabling advanced operational analysis through multi-subsystem data integration on trinity. Technical report, Sandia National Lab. (SNL-CA), Livermore, CA (United States) (2015)"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Chen, T., Guestrin, C.: XGBoost: a scalable tree boosting system. In: Proceedings of the ACM International Conference on Knowledge Discovery and Data Mining, pp. 785\u2013794 (2016)","DOI":"10.1145\/2939672.2939785"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Dalmazo, B.L., Vilela, J.P., Simoes, P., Curado, M.: Expedite feature extraction for enhanced cloud anomaly detection. In: IEEE\/IFIP Network Operations and Management Symposium, pp. 1215\u20131220 (2016)","DOI":"10.1109\/NOMS.2016.7502990"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Das, A., Mueller, F., Rountree, B.: Aarohi: making real-time node failure prediction feasible. In: 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 1092\u20131101 (2020)","DOI":"10.1109\/IPDPS47924.2020.00115"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Dorier, M., Antoniu, G., Ross, R., Kimpe, D., Ibrahim, S.: CALCioM: mitigating i\/o interference in HPC systems through cross-application coordination. In: IEEE International Parallel and Distributed Processing Symposium, pp. 155\u2013164 (2014)","DOI":"10.1109\/IPDPS.2014.27"},{"key":"5_CR18","doi-asserted-by":"crossref","unstructured":"Jayathilaka, H., Krintz, C., Wolski, R.: Performance monitoring and root cause analysis for cloud-hosted web applications. In: Proceedings of the 26th International Conference on World Wide Web, pp. 469\u2013478 (2017)","DOI":"10.1145\/3038912.3052649"},{"key":"5_CR19","first-page":"3146","volume":"30","author":"G Ke","year":"2017","unstructured":"Ke, G., et al.: Lightgbm: a highly efficient gradient boosting decision tree. Adv. Neural. Inf. Process. Syst. 30, 3146\u20133154 (2017)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Klinkenberg, J., Terboven, C., Lankes, S., M\u00fcller, M.S.: Data mining-based analysis of HPC center operations. In: IEEE International Conference on Cluster Computing (CLUSTER), pp. 766\u2013773 (2017)","DOI":"10.1109\/CLUSTER.2017.23"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Lan, Z., Zheng, Z., Li, Y.: Toward automated anomaly identification in large-scale systems. IEEE Trans. Parallel Distrib. Syst. 21(2), 174\u2013187 (2009)","DOI":"10.1109\/TPDS.2009.52"},{"key":"5_CR22","doi-asserted-by":"crossref","unstructured":"Leung, V.J., Bender, M.A., Bunde, D.P., Phillips, C.A.: Algorithmic support for commodity-based parallel computing systems. Technical report, Sandia National Laboratories (2003)","DOI":"10.2172\/918344"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Marathe, A., Zhang, Y., Blanks, G., Kumbhare, N., Abdulla, G., Rountree, B.: An empirical survey of performance and energy efficiency variation on intel processors. In: Proceedings of the 5th International Workshop on Energy Efficient Supercomputing, pp. 1\u20138 (2017)","DOI":"10.1145\/3149412.3149421"},{"issue":"253","key":"5_CR24","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1080\/01621459.1951.10500769","volume":"46","author":"FJ Massey Jr","year":"1951","unstructured":"Massey, F.J., Jr.: The Kolmogorov-Smirnov test for goodness of fit. J. Am. Stat. Assoc. 46(253), 68\u201378 (1951)","journal-title":"J. Am. Stat. Assoc."},{"issue":"7","key":"5_CR25","doi-asserted-by":"publisher","first-page":"817","DOI":"10.1016\/j.parco.2004.04.001","volume":"30","author":"ML Massie","year":"2004","unstructured":"Massie, M.L., Chun, B.N., Culler, D.E.: The ganglia distributed monitoring system: design, implementation, and experience. Parallel Comput. 30(7), 817\u2013840 (2004)","journal-title":"Parallel Comput."},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Nair, V., et al.: Learning a hierarchical monitoring system for detecting and diagnosing service issues. In: Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 2029\u20132038 (2015)","DOI":"10.1145\/2783258.2788624"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Netti, A., et al.: DCDB wintermute: enabling online and holistic operational data analytics on HPC systems. In: Proceedings of the 29th International Symposium on High-Performance Parallel and Distributed Computing, pp. 101\u2013112 (2020)","DOI":"10.1145\/3369583.3392674"},{"key":"5_CR28","first-page":"2825","volume":"12","author":"F Pedregosa","year":"2011","unstructured":"Pedregosa, F., et al.: Scikit-learn: machine learning in Python. J. Mach. Learn. Res. 12, 2825\u20132830 (2011)","journal-title":"J. Mach. Learn. Res."},{"key":"5_CR29","unstructured":"Sandia National Laboratories: HPC capacity cluster platforms (2017). https:\/\/hpc.sandia.gov\/HPC%20Production%20Clusters\/index.html"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Schwaller, B., Tucker, N., Tucker, T., Allan, B., Brandt, J.: HPC system data pipeline to enable meaningful insights through analysis-driven visualizations. In: IEEE International Conference on Cluster Computing (CLUSTER), pp. 433\u2013441 (2020)","DOI":"10.1109\/CLUSTER49012.2020.00062"},{"key":"5_CR31","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1016\/j.procs.2018.08.235","volume":"136","author":"D Shaykhislamov","year":"2018","unstructured":"Shaykhislamov, D., Voevodin, V.: An approach for dynamic detection of inefficient supercomputer applications. Procedia Comput. Sci. 136, 35\u201343 (2018)","journal-title":"Procedia Comput. Sci."},{"key":"5_CR32","doi-asserted-by":"crossref","unstructured":"Skinner, D., Kramer, W.: Understanding the causes of performance variability in HPC workloads. In: Proceedings of the IEEE Workload Characterization Symposium, pp. 137\u2013149 (2005)","DOI":"10.1109\/IISWC.2005.1526010"},{"key":"5_CR33","doi-asserted-by":"crossref","unstructured":"Tuncer, O., et al.: Online diagnosis of performance variation in HPC systems using machine learning. IEEE Trans. Parallel Distrib. Syst. 30(4), 883\u2013896 (2018)","DOI":"10.1109\/TPDS.2018.2870403"},{"issue":"1","key":"5_CR34","doi-asserted-by":"publisher","first-page":"215","DOI":"10.1109\/TVCG.2018.2865026","volume":"25","author":"C Xie","year":"2018","unstructured":"Xie, C., Xu, W., Mueller, K.: A visual analytics framework for the detection of anomalous call stack trees in high performance computing applications. IEEE Trans. Vis. Comput. Graph. 25(1), 215\u2013224 (2018)","journal-title":"IEEE Trans. Vis. Comput. Graph."},{"key":"5_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"163","DOI":"10.1007\/978-3-319-96983-1_12","volume-title":"Euro-Par 2018: Parallel Processing","author":"M Zasadzi\u0144ski","year":"2018","unstructured":"Zasadzi\u0144ski, M., Munt\u00e9s-Mulero, V., Sol\u00e9, M., Carrera, D., Ludwig, T.: Early termination of failed HPC jobs through machine and deep learning. In: Aldinucci, M., Padovani, L., Torquati, M. (eds.) Euro-Par 2018. LNCS, vol. 11014, pp. 163\u2013177. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-96983-1_12"},{"key":"5_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, X., Meng, F., Chen, P., Xu, J.: TaskInsight: a fine-grained performance anomaly detection and problem locating system. In: IEEE International Conference on Cloud Computing (CLOUD), pp. 917\u2013920 (2016)","DOI":"10.1109\/CLOUD.2016.0136"},{"key":"5_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Groves, T., Cook, B., Wright, N.J., Coskun, A.K.: Quantifying the impact of network congestion on application performance and network metrics. In: IEEE International Conference on Cluster Computing (CLUSTER), pp. 162\u2013168 (2020)","DOI":"10.1109\/CLUSTER49012.2020.00026"}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2021: Parallel Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-85665-6_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T22:02:36Z","timestamp":1756332156000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-85665-6_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030856649","9783030856656"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-85665-6_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"25 August 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Euro-Par","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Parallel Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lisbon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Portugal","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"europar2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2021.euro-par.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"136","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"38","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"6","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}