{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:57:26Z","timestamp":1776931046900,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"name":"the Advanced Research Project of China","award":["31511010501"],"award-info":[{"award-number":["31511010501"]}]},{"name":"the National Natural Science Foundation of China","award":["62272249, 62302244"],"award-info":[{"award-number":["62272249, 62302244"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["XXX-63253249"],"award-info":[{"award-number":["XXX-63253249"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759794","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:05:39Z","timestamp":1762963539000},"page":"1180-1194","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Effective Node-Level Anomaly Detection in HPC Systems via Coarse-Grained Clustering and Fine-Grained Model Sharing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2853-9549","authenticated-orcid":false,"given":"Sibo","family":"Xia","sequence":"first","affiliation":[{"name":"Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0266-7899","authenticated-orcid":false,"given":"Yongqian","family":"Sun","sequence":"additional","affiliation":[{"name":"Nankai University, Tianjin, China and Tianjin Key Laboratory of Software Experience and Human Computer Interaction, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7357-5313","authenticated-orcid":false,"given":"Xijie","family":"Pan","sequence":"additional","affiliation":[{"name":"Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5072-6093","authenticated-orcid":false,"given":"Yuan","family":"Yuan","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0330-0028","authenticated-orcid":false,"given":"Shenglin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nankai University, Tianjin, China and Haihe Laboratory of Information Technology Application Innovation, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1166-2126","authenticated-orcid":false,"given":"Shaoyu","family":"Hu","sequence":"additional","affiliation":[{"name":"Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1282-3089","authenticated-orcid":false,"given":"Lei","family":"Tao","sequence":"additional","affiliation":[{"name":"Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0576-2127","authenticated-orcid":false,"given":"Yuqi","family":"Li","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China and National Supercomputer Center in Tianjin, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9657-3584","authenticated-orcid":false,"given":"Jinghua","family":"Feng","sequence":"additional","affiliation":[{"name":"National Supercomputer Center in Tianjin, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2025. sacct. https:\/\/slurm.schedmd.com\/sacct.html. [Online]."},{"key":"e_1_3_3_2_3_2","unstructured":"Mahdi Abavisani Alireza Naghizadeh Dimitris Metaxas and Vishal Patel. 2020. Deep subspace clustering with data augmentation. Advances in Neural Information Processing Systems 33 (2020) 10360\u201310370."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Burak Aksar Efe Sencan Benjamin Schwaller Omar Aaziz Vitus\u00a0J Leung Jim Brandt Brian Kulis Manuel Egele and Ayse Coskun. 2024. Runtime Performance Anomaly Diagnosis in Production HPC Systems Using Active Learning. IEEE Transactions on Parallel and Distributed Systems 35 (2024) 693\u2013706.","DOI":"10.1109\/TPDS.2024.3365462"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607076"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-78713-4_11"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Mar\u00edlia Barandas Duarte Folgado Let\u00edcia Fernandes Sara Santos Mariana Abreu Patr\u00edcia Bota Hui Liu Tanja Schultz and Hugo Gamboa. 2020. TSFEL: Time series feature extraction library. SoftwareX 11 (2020) 100456.","DOI":"10.1016\/j.softx.2020.100456"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Paul Bergmann Kilian Batzner Michael Fauser David Sattlegger and Carsten Steger. 2021. The MVTec anomaly detection dataset: a comprehensive real-world dataset for unsupervised anomaly detection. International Journal of Computer Vision 129 4 (2021) 1038\u20131059.","DOI":"10.1007\/s11263-020-01400-4"},{"key":"e_1_3_3_2_9_2","first-page":"1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Besta Maciej","year":"2020","unstructured":"Maciej Besta, Marcel Schneider, Marek Konieczny, Karolina Cynk, Erik Henriksson, Salvatore Di\u00a0Girolamo, Ankit Singla, and Torsten Hoefler. 2020. FatPaths: Routing in supercomputers and data centers when shortest paths fall short. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201318."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Angela Bonifati Francesco Del\u00a0Buono Francesco Guerra and Donato Tiano. 2022. Time2Feat: learning interpretable representations for multivariate time series clustering. Proceedings of the VLDB Endowment (PVLDB) 16 2 (2022) 193\u2013201.","DOI":"10.14778\/3565816.3565822"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Andrea Borghesi Martin Molan Michela Milano and Andrea Bartolini. 2021. Anomaly detection and anticipation in high performance computing systems. IEEE Transactions on Parallel and Distributed Systems 33 4 (2021) 739\u2013750.","DOI":"10.1109\/TPDS.2021.3082802"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3324884.3416548"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Zixiang Chen Yihe Deng Yue Wu Quanquan Gu and Yuanzhi Li. 2022. Towards understanding the mixture-of-experts layer in deep learning. Advances in neural information processing systems 35 (2022) 23049\u201323062.","DOI":"10.52202\/068431-1675"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510085"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.5555\/3571885.3571916"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Wangxiang Ding Wenzhong Li Zhijie Zhang Chen Wan Jianhui Duan and Sanglu Lu. 2022. Time-varying Gaussian Markov random fields learning for multivariate time series clustering. IEEE Transactions on Knowledge and Data Engineering 35 11 (2022) 11950\u201311966.","DOI":"10.1109\/TKDE.2022.3232331"},{"key":"e_1_3_3_2_17_2","unstructured":"William Fedus Barret Zoph and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research 23 120 (2022) 1\u201339."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508418"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSRE55969.2022.00014"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Dan Huang Zhenlu Qin Qing Liu Norbert Podhorszki and Scott Klasky. 2022. Identifying challenges and opportunities of in-memory computing on large hpc systems. J. Parallel and Distrib. Comput. 164 (2022) 106\u2013122.","DOI":"10.1016\/j.jpdc.2022.02.002"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624145"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599258"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671507"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3627041"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671862"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i9.17008"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSRE59848.2023.00014"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Haopeng Liu Xu Wang Guangpu Li Shan Lu Feng Ye and Chen Tian. 2018. FCatch: Automatically detecting time-of-fault bugs in cloud systems. ACM SIGPLAN Notices 53 2 (2018) 419\u2013431.","DOI":"10.1145\/3296957.3177161"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Ruiying Lu YuJie Wu Long Tian Dongsheng Wang Bo Chen Xiyang Liu and Ruimin Hu. 2023. Hierarchical vector quantized transformer for multi-class unsupervised anomaly detection. Advances in Neural Information Processing Systems (2023) 8487\u20138500.","DOI":"10.52202\/075280-0370"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Martin Molan Andrea Borghesi Daniele Cesarini Luca Benini and Andrea Bartolini. 2023. RUAD: Unsupervised anomaly detection in HPC systems. Future Generation Computer Systems 141 (2023) 542\u2013554.","DOI":"10.1016\/j.future.2022.12.001"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467404"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-59851-8_18"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"George Papadimitriou and Dimitris Gizopoulos. 2023. Silent data corruptions: Microarchitectural perspectives. IEEE Trans. Comput. 72 11 (2023) 3072\u20133085.","DOI":"10.1109\/TC.2023.3285094"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"crossref","unstructured":"Armin\u00a0Danesh Pazho Ghazal\u00a0Alinezhad Noghre Arnab\u00a0A Purkayastha Jagannadh Vempati Otto Martin and Hamed Tabkhi. 2023. A survey of graph-based deep learning for anomaly detection in distributed systems. IEEE Transactions on Knowledge and Data Engineering 36 1 (2023) 1\u201320.","DOI":"10.1109\/TKDE.2023.3282898"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"crossref","unstructured":"Efe Sencan Yin-Ching Lee Connor Casey Benjamin Schwaller Vitus\u00a0J Leung Jim Brandt Brian Kulis Manuel Egele and Ayse\u00a0K Coskun. 2025. Refine: Robust Unsupervised Anomaly Detection for Production HPC Systems. (2025).","DOI":"10.23919\/ISC.2025.11018307"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650083"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3650200.3656637"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488755"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICWS60048.2023.00033"},{"key":"e_1_3_3_2_41_2","first-page":"10183","volume-title":"International conference on machine learning","author":"Tay Yi","year":"2021","unstructured":"Yi Tay, Dara Bahri, Donald Metzler, Da-Cheng Juan, Zhe Zhao, and Che Zheng. 2021. Synthesizer: Rethinking self-attention for transformer models. In International conference on machine learning. PMLR, 10183\u201310192."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Shreshth Tuli Giuliano Casale and Nicholas\u00a0R Jennings. 2022. Tranad: Deep transformer networks for anomaly detection in multivariate time series data. Proceedings of the VLDB Endowment 15 6 (2022) 1201\u20131214.","DOI":"10.14778\/3514061.3514067"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Ozan Tuncer Emre Ates Yijia Zhang Ata Turk Jim Brandt Vitus\u00a0J Leung Manuel Egele and Ayse\u00a0K Coskun. 2018. Online diagnosis of performance variation in HPC systems using machine learning. IEEE Transactions on Parallel and Distributed Systems 30 4 (2018) 883\u2013896.","DOI":"10.1109\/TPDS.2018.2870403"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Steven\u00a0Euijong Whang Yuji Roh Hwanjun Song and Jae-Gil Lee. 2023. Data collection and quality challenges in deep learning: A data-centric ai perspective. The VLDB Journal 32 4 (2023) 791\u2013813.","DOI":"10.1007\/s00778-022-00775-9"},{"key":"e_1_3_3_2_45_2","unstructured":"Justin Whitt. 2022. Frontier Testing and Tuning Problems Downplayed by Oak Ridge. InsideHPC (2022). https:\/\/insidehpc.com\/2022\/10\/frontier-testing-and-tuning-problems-downplayed-by-oak-ridge\/ Accessed: 2024-04-21."},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26617"},{"key":"e_1_3_3_2_47_2","first-page":"3861","volume-title":"international conference on machine learning","author":"Yang Bo","year":"2017","unstructured":"Bo Yang, Xiao Fu, Nicholas\u00a0D Sidiropoulos, and Mingyi Hong. 2017. Towards k-means-friendly spaces: Simultaneous deep learning and clustering. In international conference on machine learning. PMLR, 3861\u20133870."},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599295"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Nan Zhang and Shiliang Sun. 2022. Multiview unsupervised shapelet learning for multivariate time series clustering. IEEE Transactions on Pattern Analysis and Machine Intelligence 45 4 (2022) 4981\u20134996.","DOI":"10.1109\/TPAMI.2022.3198411"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3511983"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER49012.2020.00026"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657686"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i4.25620"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17325"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Kaiyang Zhou Yongxin Yang Yu Qiao and Tao Xiang. 2021. Domain adaptive ensemble learning. IEEE Transactions on Image Processing 30 (2021) 8008\u20138018.","DOI":"10.1109\/TIP.2021.3112012"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759794","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:53:18Z","timestamp":1773255198000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759794"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":54,"alternative-id":["10.1145\/3712285.3759794","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759794","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}