{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:25:21Z","timestamp":1773318321784,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62406141"],"award-info":[{"award-number":["62406141"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759893","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"1165-1179","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Exploring and Mitigating Failure Behavior of Large Language Model Training Workloads in HPC Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9926-9442","authenticated-orcid":false,"given":"Pengfei","family":"Yu","sequence":"first","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3989-1520","authenticated-orcid":false,"given":"Jingjing","family":"Gu","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7912-027X","authenticated-orcid":false,"given":"Hao","family":"Han","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3947-4153","authenticated-orcid":false,"given":"Dazhong","family":"Shen","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4750-6516","authenticated-orcid":false,"given":"Bao","family":"Wen","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2596-8419","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00092"},{"key":"e_1_3_3_3_3_2","doi-asserted-by":"crossref","unstructured":"Jean Arlat Martine Aguera Louis Amat Yves Crouzet J-C Fabre J-C Laprie Eliane Martins and David Powell. 1990. Fault injection for dependability validation: A methodology and some applications. IEEE Transactions on Software Engineering 16 2 (1990) 166\u2013182.","DOI":"10.1109\/32.44380"},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"crossref","unstructured":"Jean Arlat Yves Crouzet Johan Karlsson Peter Folkesson Emmerich Fuchs and G\u00fcnther\u00a0H Leber. 2003. Comparison of physical and software-implemented fault injection techniques. IEEE Trans. Comput. 52 9 (2003) 1115\u20131133.","DOI":"10.1109\/TC.2003.1228509"},{"key":"e_1_3_3_3_5_2","first-page":"1455","volume-title":"Conference on learning theory","author":"Arora Sanjeev","year":"2018","unstructured":"Sanjeev Arora, Wei Hu, and Pravesh\u00a0K Kothari. 2018. An analysis of the t-sne algorithm for data visualization. In Conference on learning theory. PMLR, 1455\u20131462."},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607084"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707242"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"crossref","unstructured":"Davide Baroffio Federico Reghenzani and William Fornaciari. 2024. Enhanced Compiler Technology for Software-based Hardware Fault Detection. ACM Transactions on Design Automation of Electronic Systems 29 5 Article 91 (2024) 23\u00a0pages.","DOI":"10.1145\/3660524"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"crossref","unstructured":"Anna\u00a0C Belkina Christopher\u00a0O Ciccolella Rina Anno Richard Halpert Josef Spidlen and Jennifer\u00a0E Snyder-Cappione. 2019. Automated optimized parameters for T-distributed stochastic neighbor embedding improve visualization and analysis of large datasets. Nature Communications 10 1 (2019) 5415.","DOI":"10.1038\/s41467-019-13055-y"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"crossref","unstructured":"G Bharathi\u00a0Mohan R Prasanna\u00a0Kumar P Vishal\u00a0Krishh A Keerthinathan G Lavanya Meka Kavya\u00a0Uma Meghana Sheba Sulthana and Srinath Doss. 2024. An analysis of large language models: their impact and potential applications. Knowledge and Information Systems 66 9 (2024) 5047\u20135070.","DOI":"10.1007\/s10115-024-02120-8"},{"key":"e_1_3_3_3_11_2","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et\u00a0al. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems (NeurIPS) , Vol.\u00a033. 1877\u20131901."},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00178"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSN48987.2021.00042"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717481"},{"key":"e_1_3_3_3_15_2","unstructured":"Google. 2023. Gemini Technical Reports. https:\/\/www.datacenterdynamics.com\/en\/news\/training-gemini-tpus-multiple-data-centers-and-risks-of-cosmic-rays"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"crossref","unstructured":"Luanzheng Guo Dong Li and Ignacio Laguna. 2021. Paris: Predicting application resilience using machine learning. J. Parallel and Distrib. Comput. 152 (2021) 111\u2013124.","DOI":"10.1016\/j.jpdc.2021.02.015"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2018.8342139"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589105"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607078"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2019.00126"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.5555\/3571885.3571907"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3460319.3464825"},{"key":"e_1_3_3_3_23_2","first-page":"1615","volume-title":"33rd USENIX Security Symposium (USENIX Security 24)","author":"Jattke Patrick","year":"2024","unstructured":"Patrick Jattke, Max Wipfli, Flavien Solt, Michele Marazzi, Matej B\u00f6lcskei, and Kaveh Razavi. 2024. { ZenHammer} : Rowhammer Attacks on { AMD} Zen-based Platforms. In 33rd USENIX Security Symposium (USENIX Security 24). 1615\u20131633."},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD63220.2024.00023"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-335-6.50023-4"},{"key":"e_1_3_3_3_26_2","first-page":"4171","volume-title":"Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","author":"Kenton Jacob Devlin Ming-Wei\u00a0Chang","year":"2019","unstructured":"Jacob Devlin Ming-Wei\u00a0Chang Kenton and Lee\u00a0Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). 4171\u20134186."},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607081"},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"crossref","unstructured":"Jooyoung Kim Woosung Lee Keewon Cho and Sungho Kang. 2017. Hardware-Efficient Built-In Redundancy Analysis for Memory With Various Spares. IEEE Transactions on Very Large Scale Integration Systems 25 3 (2017) 844\u2013856.","DOI":"10.1109\/TVLSI.2016.2606499"},{"key":"e_1_3_3_3_29_2","first-page":"26736","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Li Conglong","year":"2022","unstructured":"Conglong Li, Minjia Zhang, and Yuxiong He. 2022. The stability-efficiency dilemma: Investigating sequence length warmup for training GPT models. In Advances in Neural Information Processing Systems (NeurIPS) , Vol.\u00a035. 26736\u201326750."},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126964"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651349"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSRE.2018.00021"},{"key":"e_1_3_3_3_33_2","unstructured":"Laurens van\u00a0der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of Machine Learning Research 9 (2008) 2579\u20132605."},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSN-W50199.2020.00014"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICECA.2017.8212811"},{"key":"e_1_3_3_3_36_2","unstructured":"Meta. 2024. Llama3 Technical Reports. https:\/\/ai.meta.com\/blog\/meta-llama-3"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"crossref","unstructured":"Niranjhana Narayanan Zitao Chen Bo Fang Guanpeng Li Karthik Pattabiraman and Nathan DeBardeleben. 2023. Fault Injection for TensorFlow Applications. IEEE Transactions on Dependable and Secure Computing 20 4 (2023) 2677\u20132695.","DOI":"10.1109\/TDSC.2022.3175930"},{"key":"e_1_3_3_3_39_2","unstructured":"Humza Naveed Asad\u00a0Ullah Khan Shi Qiu Muhammad Saqib Saeed Anwar Muhammad Usman Naveed Akhtar Nick Barnes and Ajmal Mian. 2023. A comprehensive overview of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.06435 (2023)."},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446091"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-NIER52604.2021.00022"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1144"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"crossref","unstructured":"Fritz Previlon Charu Kalra Devesh Tiwari and David Kaeli. 2022. Characterizing and Exploiting Soft Error Vulnerability Phase Behavior in GPU Applications. IEEE Transactions on Dependable and Secure Computing 19 1 (2022) 288\u2013300.","DOI":"10.1109\/TDSC.2020.2991136"},{"key":"e_1_3_3_3_44_2","unstructured":"Colin Raffel Noam Shazeer Adam Roberts Katherine Lee Sharan Narang Michael Matena Yanqi Zhou Wei Li and Peter\u00a0J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research 21 140 (2020) 1\u201367."},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00058"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3195970.3195997"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"crossref","unstructured":"Behrooz Sangchoolie Karthik Pattabiraman and Johan Karlsson. 2022. An Empirical Study of the Impact of Single and Multiple Bit-Flip Errors in Programs. IEEE Transactions on Dependable and Secure Computing 19 3 (2022) 1988\u20132006.","DOI":"10.1109\/TDSC.2020.3043023"},{"key":"e_1_3_3_3_48_2","unstructured":"Dazhong Shen Guanglu Song Yi Zhang Bingqi Ma Lujundong Li Dongzhi Jiang Zhuofan Zong and Yu Liu. 2025. ADT: Tuning Diffusion Models with Adversarial Supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.11423 (2025)."},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCWC62904.2025.10903774"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00010"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2001.941435"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/ETS61313.2024.10567161"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807666"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSN48987.2021.00041"},{"key":"e_1_3_3_3_55_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Processing Systems (NeurIPS) , Vol.\u00a030."},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"e_1_3_3_3_57_2","doi-asserted-by":"crossref","unstructured":"Xiaohui Wei Chenyang Wang Hengshan Yue Jingweijia Tan Zeyu Guan Nan Jiang Xinyang Zheng Jianpeng Zhao and Meikang Qiu. 2024. ReIPE: Recycling Idle PEs in CNN Accelerator for Vulnerable Filters Soft-Error Detection. ACM Transactions on Architecture and Code Optimization 21 3 Article 61 (2024) 26\u00a0pages.","DOI":"10.1145\/3674909"},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"crossref","unstructured":"Zujia Yan Yi Zhuang Weining Zheng and Jingjing Gu. 2023. Multi-bit Data Flow Error Detection Method Based on SDC Vulnerability Analysis. ACM Transactions on Embedded Computing Systems 22 3 Article 49 (2023) 30\u00a0pages.","DOI":"10.1145\/3572838"},{"key":"e_1_3_3_3_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3690624.3709424"},{"key":"e_1_3_3_3_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476170"},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"crossref","unstructured":"Jinyu Zhan Ruoxu Sun Wei Jiang Yucheng Jiang Xunzhao Yin and Cheng Zhuo. 2022. Improving Fault Tolerance for Reliable DNN Using Boundary-Aware Activation. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 41 10 (2022) 3414\u20133425.","DOI":"10.1109\/TCAD.2021.3129114"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"publisher","DOI":"10.23919\/DATE54114.2022.9774569"},{"key":"e_1_3_3_3_63_2","unstructured":"Kai Zhao Sheng Di Sihuan Li Xin Liang Yujia Zhai Jieyang Chen Kaiming Ouyang Franck Cappello and Zizhong Chen. 2020. FT-CNN: Algorithm-based fault tolerance for convolutional neural networks. IEEE Transactions on Parallel and Distributed Systems 32 7 (2020) 1677\u20131689."},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"crossref","unstructured":"Changbao Zhou Jiawei Du Ming Yan Hengshan Yue Xiaohui Wei and Joey\u00a0Tianyi Zhou. 2024. SAR: Sharpness-Aware minimization for enhancing DNNs\u2019 Robustness against bit-flip errors. Journal of Systems Architecture 156 (2024) 103284.","DOI":"10.1016\/j.sysarc.2024.103284"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"crossref","unstructured":"Ranyang Zhou Jacqueline Liu Sabbir Ahmed Nakul Kochar Adnan\u00a0Siraj Rakin and Shaahin Angizi. 2024. Assessing the Potential of Escalating RowHammer Attack Distance to Bypass Counter-Based Defenses. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (2024).","DOI":"10.1109\/TCAD.2024.3481388"},{"key":"e_1_3_3_3_66_2","unstructured":"Yuanxin Zhuang Dazhong Shen and Ying Sun. 2025. MolEditRL: Structure-Preserving Molecular Editing via Discrete Diffusion and Reinforcement Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.20131 (2025)."},{"key":"e_1_3_3_3_67_2","doi-asserted-by":"crossref","unstructured":"Zhuofan Zong Bingqi Ma Dazhong Shen Guanglu Song Hao Shao Dongzhi Jiang Hongsheng Li and Yu Liu. 2024. Mova: Adapting mixture of vision experts to multimodal context. Advances in neural information processing systems (NeurIPS) 37 (2024) 103305\u2013103333.","DOI":"10.52202\/079017-3282"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759893","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:41:09Z","timestamp":1773254469000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759893"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":66,"alternative-id":["10.1145\/3712285.3759893","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759893","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}