{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T00:20:52Z","timestamp":1775694052544,"version":"3.50.1"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"15","license":[{"start":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T00:00:00Z","timestamp":1758326400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T00:00:00Z","timestamp":1758326400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"DOI":"10.1007\/s11227-025-07849-9","type":"journal-article","created":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T19:48:14Z","timestamp":1758397694000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["HAMS: an AI-driven framework for real-time failure detection in HPC system logs"],"prefix":"10.1007","volume":"81","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2858-2583","authenticated-orcid":false,"given":"Byungwoo","family":"Bang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4773-6467","authenticated-orcid":false,"given":"Jaehoon","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5911-4087","authenticated-orcid":false,"given":"Uiseok","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6147-0588","authenticated-orcid":false,"given":"Junyeon","family":"Lee","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7512-6754","authenticated-orcid":false,"given":"Jeonghyeon","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,9,20]]},"reference":[{"issue":"7","key":"7849_CR1","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1145\/2699414","volume":"58","author":"DA Reed","year":"2015","unstructured":"Reed DA, Dongarra J (2015) Exascale computing and big data. Commun ACM 58(7):56\u201368","journal-title":"Commun ACM"},{"issue":"4","key":"7849_CR2","doi-asserted-by":"publisher","first-page":"435","DOI":"10.1177\/1094342018778123","volume":"32","author":"E Deelman","year":"2018","unstructured":"Deelman E, Vahi V, Juve G, Da Silva LF, Ellisman A (2018) Big data and extreme-scale computing: pathways to convergence\u2014Toward a shaping strategy for a future software and data ecosystem for scientific inquiry. Int J High Perform Comput Appl 32(4):435\u2013479","journal-title":"Int J High Perform Comput Appl"},{"key":"7849_CR3","unstructured":"Oppenheimer D, Ganapathi A, Patterson DA (2003) Why do internet services fail, and what can be done about it? In: 4th USENIX Symposium on Internet Technologies and Systems (USITS '03)"},{"key":"7849_CR4","doi-asserted-by":"publisher","first-page":"435","DOI":"10.1007\/s11227-023-05482-y","volume":"80","author":"J-W Park","year":"2024","unstructured":"Park J-W, Huang X, Lee C-H (2024) Analyzing and predicting job failures from HPC system log. J Supercomput 80:435\u2013462","journal-title":"J Supercomput"},{"key":"7849_CR5","doi-asserted-by":"crossref","unstructured":"Meng F, Jiang ZM (2019) LogAnomaly: unsupervised detection of sequential and quantitative anomalies in unstructured logs. In: Proceedings of the Twenty-Eighth International Joint Conference on Artificial Intelligence (IJCAI), pp 4739\u20134745","DOI":"10.24963\/ijcai.2019\/658"},{"key":"7849_CR6","unstructured":"Team L (2024) The Llama 3 herd of models. arXiv preprint arXiv:2407.21783"},{"issue":"4","key":"7849_CR7","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1109\/TDSC.2009.4","volume":"7","author":"B Schroeder","year":"2009","unstructured":"Schroeder B, Gibson G (2009) A large-scale study of failures in high-performance computing systems. IEEE Trans Depend Secure Comput 7(4):337\u2013350","journal-title":"IEEE Trans Depend Secure Comput"},{"key":"7849_CR8","doi-asserted-by":"crossref","unstructured":"Tiwari D, Gupta S, Kalankar SH (2015) Reliability lessons learned from GPU experience with the titan supercomputer at Oak Ridge Leadership Computing Facility. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp 1\u201312","DOI":"10.1145\/2807591.2807666"},{"key":"7849_CR9","doi-asserted-by":"crossref","unstructured":"Gainaru A, Cappello F, Snir M, Kramer W (2012) Fault prediction under the microscope: a closer look into HPC systems. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (SC)","DOI":"10.1109\/SC.2012.57"},{"key":"7849_CR10","doi-asserted-by":"crossref","unstructured":"He P, Zhu J, He S, Li J, Lyu MR (2016) An evaluation study on log parsing and its use in log mining. In: Proceedings of the 2016 46th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks, Toulouse, France","DOI":"10.1109\/DSN.2016.66"},{"key":"7849_CR11","doi-asserted-by":"crossref","unstructured":"Du M, Li F (2017) \"DeepLog: anomaly detection and diagnosis from system logs through deep learning. In: Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security (CCS), Dallas, TX, USA","DOI":"10.1145\/3133956.3134015"},{"issue":"10","key":"7849_CR12","doi-asserted-by":"publisher","DOI":"10.3390\/electronics12102260","volume":"12","author":"W Dobrowolski","year":"2023","unstructured":"Dobrowolski W, Nikodem M, Unold O (2023) Software failure log analysis for engineers\u2014review. Electronics 12(10):2260","journal-title":"Electronics"},{"key":"7849_CR13","doi-asserted-by":"publisher","DOI":"10.1007\/s10922-024-09831-x","author":"E Karlsen","year":"2024","unstructured":"Karlsen E, Luo X, Zincir-Heywood N, Heywood M (2024) Benchmarking large language models for log analysis, security, and interpretation. J Netw Syst Manage. https:\/\/doi.org\/10.1007\/s10922-024-09831-x","journal-title":"J Netw Syst Manage"},{"key":"7849_CR14","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2019) BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp 4171\u20134186"},{"key":"7849_CR15","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing systems 30 (NIPS 2017), pp 5998\u20136008"},{"key":"7849_CR16","unstructured":"TOP500.org, \"SSC-21 Supercomputer System Details,\" [Online]. Available: https:\/\/top500.org\/system\/180041\/. Accessed Nov 2021"},{"key":"7849_CR17","volume-title":"Practical reliability engineering","author":"P O'Connor","year":"2012","unstructured":"O\u2019Connor P, Newton DT, Kleyner A (2012) Practical reliability engineering, 5th edn. Wiley, Hoboken, NJ","edition":"5"},{"key":"7849_CR18","doi-asserted-by":"publisher","first-page":"1085","DOI":"10.1007\/s10586-018-02888-9","volume":"22","author":"AM Abdullah","year":"2019","unstructured":"Abdullah AM, Ali HA, Haikal AY (2019) A reliable, TOPSIS-based multi-criteria, and hierarchical load balancing method for computational grid. Cluster Comput 22:1085\u20131106","journal-title":"Cluster Comput"},{"key":"7849_CR19","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/78\/1\/012022","author":"B Schroeder","year":"2007","unstructured":"Schroeder B, Gibson GA (2007) Understanding failures in petascale computers. J Phys: Conf Ser. https:\/\/doi.org\/10.1088\/1742-6596\/78\/1\/012022","journal-title":"J Phys: Conf Ser"},{"key":"7849_CR20","volume-title":"Case studies in reliability and maintenance","author":"WR Blischke","year":"2003","unstructured":"Blischke WR, Murthy DNP (2003) Case studies in reliability and maintenance. Wiley, Hoboken, NJ"},{"issue":"1","key":"7849_CR21","first-page":"5","volume":"1","author":"F Cappello","year":"2014","unstructured":"Cappello F, Al-Ghalith G, Gropp W, Kale S, Kramer B, Snir M (2014) Toward exascale resilience: 2014 update. Supercomput Front Innov 1(1):5\u201328","journal-title":"Supercomput Front Innov"},{"issue":"2","key":"7849_CR22","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1177\/1094342014522573","volume":"28","author":"M Snir","year":"2014","unstructured":"Snir M, Wisniewski RW, Abraham JA, Adve SV, Bagchi S, Balaji P, Shalf J (2014) Addressing failures in exascale computing. Int J High Perform Comput Appl 28(2):129\u2013173","journal-title":"Int J High Perform Comput Appl"},{"key":"7849_CR23","volume-title":"UNIX and Linux system administration handbook","author":"E Nemeth","year":"2017","unstructured":"Nemeth E, Snyder G, Hein TR, Whaley B, Mackin D (2017) UNIX and Linux system administration handbook, 5th edn. Addison-Wesley Professional, Boston, MA","edition":"5"},{"issue":"no. 2","key":"7849_CR24","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1145\/2076450.2076466","volume":"55","author":"A Oliner","year":"2012","unstructured":"Oliner A, Ganapathi A, Xu W (2012) Advances and challenges in log analysis. Commun ACM 55(2):55\u201361","journal-title":"Commun ACM"},{"key":"7849_CR25","unstructured":"NVIDIA Corporation, \"GPU Management and Deployment,\" NVIDIA Documentation, Sept. 2024. [Online]. Available: https:\/\/docs.nvidia.com\/deploy\/xid-errors\/contents.html."},{"issue":"3","key":"7849_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1670679.1670680","volume":"42","author":"F Salfner","year":"2010","unstructured":"Salfner F, Lenk M, Malek M (2010) A survey of online failure prediction methods. ACM Comput Surv 42(3):1\u201342","journal-title":"ACM Comput Surv"},{"key":"7849_CR27","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2019) \"RoBERTa: a robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692"},{"key":"7849_CR28","doi-asserted-by":"crossref","unstructured":"Fang H, Wang S, Zhou M, Ding J, Xie P (2020) CERT: contrastive self-supervised learning for language understanding. In: Proceedings of the 28th International Conference on Computational Linguistics","DOI":"10.36227\/techrxiv.12308378"},{"key":"7849_CR29","doi-asserted-by":"crossref","unstructured":"Gao T, Yao X, Chen D (2021) \"SimCSE: simple contrastive learning of sentence embeddings. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp 6894\u20136910","DOI":"10.18653\/v1\/2021.emnlp-main.552"},{"key":"7849_CR30","doi-asserted-by":"crossref","unstructured":"Gururangan S, Marasovi\u0107 A, Swayamdipta S, Lo K, Beltagy I, Downey D, Smith NA (2020) \"Don't stop pretraining: adapt language models to domains and tasks. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp 8342\u20138360","DOI":"10.18653\/v1\/2020.acl-main.740"},{"key":"7849_CR31","unstructured":"Intel Corporation, \"Intelligent Platform Management Interface Specification v2.0 rev. 1.1,\" Oct. 2013. [Online]. Available: https:\/\/www.intel.com\/content\/www\/kr\/ko\/products\/docs\/servers\/ipmi\/ipmi-second-gen-interface-spec-v2-rev1-1.html"},{"issue":"no. 3","key":"7849_CR32","doi-asserted-by":"publisher","first-page":"273","DOI":"10.1177\/1094342013488258","volume":"27","author":"A Gainaru","year":"2013","unstructured":"Gainaru A, Cappello F, Snir M, Kramer W (2013) Failure prediction for HPC systems and applications. Int J High Perform Comput Appl 27(3):273\u2013282","journal-title":"Int J High Perform Comput Appl"},{"key":"7849_CR33","unstructured":"de Supinski BR (2009) The CIFTS architecture: a coordinated infrastructure for fault-tolerant systems. In: IEEE International Symposium on Parallel & Distributed Processing"},{"key":"7849_CR34","unstructured":"Gainaru A (2011) Ovis 2: A robust distributed architecture for scalable RAS in HPC. In: IEEE International Parallel & Distributed Processing Symposium"},{"key":"7849_CR35","doi-asserted-by":"crossref","unstructured":"Fu S, Xu C-Z (2007) Exploring event correlation for failure prediction in coalitions of clusters. In: Proceedings of the 2007 ACM\/IEEE Conference on Supercomputing","DOI":"10.1145\/1362622.1362678"},{"key":"7849_CR36","doi-asserted-by":"crossref","unstructured":"Liang Y, Zhang Y, Xiong H, Sahoo R (2007) Failure prediction in IBM BlueGene\/L event logs. In: Seventh IEEE International Conference on Data Mining","DOI":"10.1109\/ICDM.2007.46"},{"key":"7849_CR37","doi-asserted-by":"publisher","first-page":"471","DOI":"10.1007\/s10586-019-02917-1","volume":"22","author":"B Mohammed","year":"2019","unstructured":"Mohammed B, Awan I, Ugail H, Younas M (2019) Failure prediction using machine learning in a virtualised HPC system and application. Cluster Comput 22:471\u2013485","journal-title":"Cluster Comput"},{"key":"7849_CR38","doi-asserted-by":"publisher","first-page":"1345","DOI":"10.1007\/s10586-019-02998-y","volume":"23","author":"A Alnafessah","year":"2019","unstructured":"Alnafessah A, Casale G (2019) Artificial neural networks based techniques for anomaly detection in Apache Spark. Cluster Comput 23:1345\u20131360","journal-title":"Cluster Comput"},{"key":"7849_CR39","unstructured":"Lou J, Fu Q, Yang S, Xu D, Lou J-G (2010) Mining invariants from console logs for system problem detection. In: USENIX Annual Technical Conference"},{"key":"7849_CR40","doi-asserted-by":"crossref","unstructured":"Guo H, Yuan S, Wu X (2021) LogBERT: log anomaly detection via BERT. In: Proceedings of the 2021 International Joint Conference on Neural Networks (IJCNN), Shenzhen, China","DOI":"10.1109\/IJCNN52387.2021.9534113"},{"key":"7849_CR41","unstructured":"Ruff L, Vandermeulen R, Goernitz N, Lucas D, Siddiqui SA, Kloft M, M\u00fcller E, Binder A (2018) Deep one-class classification. In: International Conference on Machine Learning"},{"key":"7849_CR42","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2023.110689","author":"Y Lee","year":"2023","unstructured":"Lee Y, Kim J, Kang P (2023) LAnoBERT: system log anomaly detection based on BERT masked language model. Appl Soft Comput. https:\/\/doi.org\/10.1016\/j.asoc.2023.110689","journal-title":"Appl Soft Comput"},{"key":"7849_CR43","doi-asserted-by":"crossref","unstructured":"Xu W, Huang L, Fox A, Patterson D, Jordan MI (2009) Detecting large-scale system problems by mining console logs. In: Proceedings of the ACM SIGOPS 22nd Symposium on Operating Systems Principles","DOI":"10.1145\/1629575.1629587"},{"key":"7849_CR44","unstructured":"\"Filebeat Documentation,\" Elasticsearch, [Online]. Available: https:\/\/www.elastic.co\/beats\/filebeat"},{"key":"7849_CR45","unstructured":"\"Apache Kafka,\" Apache Software Foundation, [Online]. Available: https:\/\/kafka.apache.org\/"},{"key":"7849_CR46","unstructured":"\"Grafana,\" Grafana Labs, [Online]. Available: https:\/\/grafana.com\/"},{"key":"7849_CR47","unstructured":"Yang Z, Dai Z, Y. Yang, J. Carbonell, R. Salakhutdinov, and Q. V. Le, \"XLNet: Generalized Autoregressive Pretraining for Language Understanding,\" in Advances in Neural Information Processing Systems (NeurIPS), 2019."},{"key":"7849_CR48","unstructured":"He P, Liu X, Gao J, Chen J (2021) DeBERTa: Decoding-enhanced BERT with disentangled attention. In: International Conference on Learning Representations (ICLR)"},{"key":"7849_CR49","unstructured":"Sanh V, Debut L, Chaumond J, Wolf T (2019) DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. In: NeurIPS Workshop on Efficient Methods for Deep Neural Networks"},{"key":"7849_CR50","unstructured":"Clark K, Luong MT, Le QV, Manning CD (2020) ELECTRA: pre-training text encoders as discriminators rather than generators. In: International Conference on Learning Representations (ICLR)"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07849-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-025-07849-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07849-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T19:48:21Z","timestamp":1758397701000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-025-07849-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,20]]},"references-count":50,"journal-issue":{"issue":"15","published-online":{"date-parts":[[2025,10]]}},"alternative-id":["7849"],"URL":"https:\/\/doi.org\/10.1007\/s11227-025-07849-9","relation":{},"ISSN":["1573-0484"],"issn-type":[{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,20]]},"assertion":[{"value":"10 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 September 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"1364"}}