{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T19:26:43Z","timestamp":1777058803709,"version":"3.51.4"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,7,5]],"date-time":"2025-07-05T00:00:00Z","timestamp":1751673600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,5]],"date-time":"2025-07-05T00:00:00Z","timestamp":1751673600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. HPC"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s42514-025-00233-2","type":"journal-article","created":{"date-parts":[[2025,7,5]],"date-time":"2025-07-05T04:00:53Z","timestamp":1751688053000},"page":"413-430","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Deep learning-based prediction of major page faults in cluster systems"],"prefix":"10.1007","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8659-1803","authenticated-orcid":false,"given":"Edward","family":"Chuah","sequence":"first","affiliation":[]},{"given":"Arshad","family":"Jhumka","sequence":"additional","affiliation":[]},{"given":"Sai","family":"Narasimhamurthy","sequence":"additional","affiliation":[]},{"given":"Aladdin","family":"Ayesh","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,5]]},"reference":[{"key":"233_CR1","unstructured":"Abraham, J.P.: An Algorithmic Approach for Optimizing Page Fault to Increase Processor Performance Along with Thread-Level Speculation, p. 178. Cochin University of Science and Technology (2017). http:\/\/hdl.handle.net\/10603\/231904"},{"key":"233_CR2","doi-asserted-by":"publisher","unstructured":"Alharthi, K.A., Jhumka, A., Di, S., Cappello, F.: Clairvoyant: A log-based transformer-decoder for failure prediction in large-scale systems. In: Proceedings of the 36th ACM International Conference on Supercomputing (ICS). Association for Computing Machinery, New York, NY, USA (2022). https:\/\/doi.org\/10.1145\/3524059.3532374","DOI":"10.1145\/3524059.3532374"},{"key":"233_CR3","doi-asserted-by":"publisher","unstructured":"Alharthi, K.A., Jhumka, A., Di, S., Gui, L., Cappello, F., McIntosh-Smith, S.: Time machine: Generative real-time model for failure (and lead time) prediction in HPC systems. In: Proceedings of the Annual IEEE\/IFIP International Conference on Dependable Systems and Networks (DSN) (2023). https:\/\/doi.org\/10.1109\/DSN58367.2023.00054","DOI":"10.1109\/DSN58367.2023.00054"},{"issue":"1","key":"233_CR4","doi-asserted-by":"publisher","first-page":"11","DOI":"10.1109\/TDSC.2004.2","volume":"1","author":"A Avizienis","year":"2004","unstructured":"Avizienis, A., Lapire, J.-C., Randell, B., Landwehr, C.: Basic concepts and taxonomy of dependable and secure computing. IEEE Trans. Dependable Secure Comput. 1(1), 11\u201333 (2004). https:\/\/doi.org\/10.1109\/TDSC.2004.2","journal-title":"IEEE Trans. Dependable Secure Comput."},{"key":"233_CR5","doi-asserted-by":"publisher","unstructured":"Chuah, E., Jhumka, A., Alt, S., Evans, R.T., Suri, N.: Failure diagnosis for cluster systems using partial correlations. In: Proceedings of IEEE International Symposium on Parallel & Distributed Processing with Applications (ISPA) (2021). https:\/\/doi.org\/10.1109\/ISPA-BDCloud-SocialCom-SustainCom52081.2021.00151","DOI":"10.1109\/ISPA-BDCloud-SocialCom-SustainCom52081.2021.00151"},{"key":"233_CR6","doi-asserted-by":"publisher","unstructured":"Chuah, E., Jhumka, A., Browne, J.C., Gurumdimma, N., Narasimharmuthy, S., Barth, B.: Using message logs and resource use data for cluster failure diagnosis. In: Proceedings of IEEE International Conference on High Performance Computing (HiPC) (2016). https:\/\/doi.org\/10.1109\/HiPC.2016.035","DOI":"10.1109\/HiPC.2016.035"},{"key":"233_CR7","doi-asserted-by":"publisher","unstructured":"Chuah, E., Jhumka, A., Narasimhamurthy, S.: An empirical study of major page faults for failure diagnosis in cluster systems. Journal of Supercomputing, 1\u201335 (2023). https:\/\/doi.org\/10.1007\/s11227-023-05366-1","DOI":"10.1007\/s11227-023-05366-1"},{"key":"233_CR8","doi-asserted-by":"publisher","unstructured":"Chuah, E., Jhumka, A., Narasimharmuthy, S., Hammond, J., Browne, J.C., Barth, B.: Linking resource usage anomalies with system failures from cluster log data. In: Proceedings of IEEE International Symposium on Reliable Distributed Systems (SRDS) (2013). https:\/\/doi.org\/10.1109\/SRDS.2013.20","DOI":"10.1109\/SRDS.2013.20"},{"key":"233_CR9","doi-asserted-by":"publisher","unstructured":"Chuah, E., Kuo, S.-H., Hiew, P., Tjhi, W.-C., Lee, G., Hammond, J., Michalewicz, M.T., Hung, T., Browne, J.C.: Diagnosing the root-causes of failures from cluster log files. In: Proceedings of IEEE International Conference on High-Performance Computing (HiPC), pp. 1\u201310 (2010). https:\/\/doi.org\/10.1109\/HIPC.2010.5713159","DOI":"10.1109\/HIPC.2010.5713159"},{"key":"233_CR10","doi-asserted-by":"publisher","DOI":"10.1016\/C2010-0-65241-2","volume-title":"Logging and Log Management","author":"A Chuvakin","year":"2013","unstructured":"Chuvakin, A., Schmidt, K., Phillips, C.: Logging and Log Management, pp. 1\u2013463. ScienceDirect, USA (2013). https:\/\/doi.org\/10.1016\/C2010-0-65241-2"},{"key":"233_CR11","unstructured":"Darlington, R.B., Hayes, A.F.: Regression Analysis and Linear Models: Concepts, Applications, and Implementation, p. 661. The Guilford Press, (2016)"},{"key":"233_CR12","doi-asserted-by":"publisher","unstructured":"Das, A., Mueller, F., Hargrove, P., Roman, E., Baden, S.: Doomsday: Predicting which node will fail when on supercomputers. In: Proceedings of the IEEE\/ACM International Conference for High Performance Computing, Networking, Storage and Analysis (SC) (2018). https:\/\/doi.org\/10.1109\/SC.2018.00012","DOI":"10.1109\/SC.2018.00012"},{"key":"233_CR13","doi-asserted-by":"publisher","unstructured":"Das, A., Mueller, F., Siegel, C., Vishnu, A.: Desh: Deep learning for system health prediction of lead times to failure in HPC. In: Proceedings of ACM International Symposium on High-Performance Parallel and Distributed Computing (HPDC) (2018). https:\/\/doi.org\/10.1145\/3208040.3208051","DOI":"10.1145\/3208040.3208051"},{"key":"233_CR14","unstructured":"Deitel, H.M.: Operating Systems. Addison-Wesley, Longman Publishing Co., Inc. (1990)"},{"key":"233_CR15","doi-asserted-by":"publisher","unstructured":"Di, S., Bouguerra, M.S., Bautista-Gomez, L., Cappello, F.: Optimization of multi-level checkpoint model for large scale HPC applications. In: IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 1181\u20131190 (2014). https:\/\/doi.org\/10.1109\/IPDPS.2014.122","DOI":"10.1109\/IPDPS.2014.122"},{"key":"233_CR16","doi-asserted-by":"publisher","unstructured":"Evans, R.T., Browne, J.C., Barth, W.L.: Understanding application and system performance through system-wide monitoring. In: Proceedings of IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW) (2016). https:\/\/doi.org\/10.1109\/IPDPSW.2016.145","DOI":"10.1109\/IPDPSW.2016.145"},{"key":"233_CR17","doi-asserted-by":"publisher","unstructured":"Fu, S., Xu, C.-Z.: Exploring event correlation for failure prediction in coalitions of clusters. In: Proceedings of ACM\/IEEE International Conference on Supercomputing (SC) (2007). https:\/\/doi.org\/10.1145\/1362622.1362678","DOI":"10.1145\/1362622.1362678"},{"key":"233_CR18","doi-asserted-by":"publisher","unstructured":"Gainaru, A., Cappello, F., Fullop, J., Trausan-Matu, S., Kramer, W.: Adaptive event prediction strategy with dynamic time window for large-scale HPC systems. In: Proceedings of SLAML Workshop, pp. 1\u20138 (2011). https:\/\/doi.org\/10.1145\/2038633.2038637","DOI":"10.1145\/2038633.2038637"},{"key":"233_CR19","doi-asserted-by":"publisher","unstructured":"Gainaru, A., Cappello, F., Snir, M., Kramer, W.: Fault prediction under the microscope: A closer look into HPC systems. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis. SC \u201912, pp. 77\u201317711 (2012). https:\/\/doi.org\/10.1109\/SC.2012.57","DOI":"10.1109\/SC.2012.57"},{"key":"233_CR20","doi-asserted-by":"publisher","unstructured":"Gu, J., Zheng, Z., Lan, Z., White, J., Hocks, E., Park, B.-H.: Dynamic meta-learning for failure prediction in large-scale systems: A case study. In: Proceedings of 37th International Conference on Parallel Processing (ICPP), pp. 157\u2013164 (2008). https:\/\/doi.org\/10.1109\/ICPP.2008.17","DOI":"10.1109\/ICPP.2008.17"},{"key":"233_CR21","doi-asserted-by":"publisher","unstructured":"Hammond, J.L., Minyard, T., Browne, J.: End-to-end framework for fault management for open source clusters: Ranger. In: Proceedings of ACM TeraGrid Conference (2010). https:\/\/doi.org\/10.1145\/1838574.1838583","DOI":"10.1145\/1838574.1838583"},{"key":"233_CR22","doi-asserted-by":"publisher","unstructured":"He, P., Zhu, J., He, S., Li, J., Lyu, M.R.: An evaluation study on log parsing and its use in log mining. In: Proceedings of IEEE\/IFIP International Conference on Dependable Systems and Networks (DSN), pp. 654\u2013661 (2016). https:\/\/doi.org\/10.1109\/DSN.2016.66","DOI":"10.1109\/DSN.2016.66"},{"issue":"8","key":"233_CR23","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997). https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput."},{"key":"233_CR24","unstructured":"IEEE: IEEE Std 1003.1-2001 Standard for Information Technology \u2014 Portable Operating System Interface (POSIX) Rationale (Informative), p. 310. IEEE Standards, USA (2001)"},{"key":"233_CR25","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/11171.001.0001","volume-title":"Deep Learning","author":"JD Kelleher","year":"2019","unstructured":"Kelleher, J.D.: Deep Learning, pp. 1\u2013296. MIT Press, USA (2019)"},{"key":"233_CR26","unstructured":"Kuhn, M., Johnson, K.: Applied Predictive Modeling, pp. 1\u2013613. Springer, Cham (2018)"},{"issue":"6","key":"233_CR27","doi-asserted-by":"publisher","first-page":"630","DOI":"10.1016\/j.jpdc.2010.03.003","volume":"70","author":"Z Lan","year":"2010","unstructured":"Lan, Z., Gu, J., Zheng, Z., Thakur, R., Coghlan, S.: A study of dynamic meta-learning for failure prediction in large-scale systems. Journal of Parallel and Distributed Computing 70(6), 630\u2013643 (2010). https:\/\/doi.org\/10.1016\/j.jpdc.2010.03.003","journal-title":"Journal of Parallel and Distributed Computing"},{"key":"233_CR28","doi-asserted-by":"publisher","unstructured":"Lea, C., Vidal, R., Reiter, A., Hager, G.D.: Temporal convolutional networks: A unified approach to action segmentation. In: Hua, G., J\u00e9gou, H. (eds.) Computer Vision \u2013 ECCV 2016 Workshops, pp. 47\u201354. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-49409-8_7","DOI":"10.1007\/978-3-319-49409-8_7"},{"key":"233_CR29","doi-asserted-by":"publisher","unstructured":"Li, L., Znati, T.: AtFP: Attention-based failure predictor for extreme-scale computing. In: Proceedings of the IEEE 13th International Conference on Reliability, Maintainability, and Safety (ICRMS), pp. 23\u201327 (2022). https:\/\/doi.org\/10.1109\/ICRMS55680.2022.9944604","DOI":"10.1109\/ICRMS55680.2022.9944604"},{"key":"233_CR30","doi-asserted-by":"publisher","unstructured":"Mitra, S., Javagal, S., Maji, A.K., Gamblin, T., Moody, A., Harrell, S., Bagchi, S.: A study of failures in community clusters: The case of conte. In: Proceedings of the IEEE International Symposium on Software Reliability Engineering Workshops (ISSREW), pp. 189\u2013196 (2016). https:\/\/doi.org\/10.1109\/ISSREW.2016.7","DOI":"10.1109\/ISSREW.2016.7"},{"key":"233_CR31","doi-asserted-by":"publisher","unstructured":"Nicolae, B., Moody, A., Gonsiorowski, E., Mohror, K., Cappello, F.: VeloC: Towards high performance adaptive asynchronous checkpointing at large scale. In: IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 911\u2013920 (2019). https:\/\/doi.org\/10.1109\/IPDPS.2019.00099","DOI":"10.1109\/IPDPS.2019.00099"},{"key":"233_CR32","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1016\/j.micpro.2019.01.006","volume":"66","author":"A Orlando","year":"2019","unstructured":"Orlando, A., Amato, P., Caraccio, D., Cinque, M., Izzi, R., Mirichigni, G., Porzio, L.: Linux page fault analysis in android systems. Microprocess. Microsyst. 66, 10\u201318 (2019). https:\/\/doi.org\/10.1016\/j.micpro.2019.01.006","journal-title":"Microprocess. Microsyst."},{"key":"233_CR33","doi-asserted-by":"publisher","unstructured":"Park, J.-W., Huang, X., Lee, C.-H.: Analyzing and predicting job failures from HPC system log. The Journal of Supercomputing, 1\u201328 (2023) https:\/\/doi.org\/10.1007\/s11227-023-05482-y","DOI":"10.1007\/s11227-023-05482-y"},{"issue":"1","key":"233_CR34","doi-asserted-by":"publisher","first-page":"14","DOI":"10.5573\/JSTS.2018.18.1.014","volume":"18","author":"Y Park","year":"2018","unstructured":"Park, Y., Bahn, H.: Analysis of memory access latency considering page faults and TLB misses in NVM storage. J. Semiconduct. Technol. Sci. 18(1), 14\u201319 (2018). https:\/\/doi.org\/10.5573\/JSTS.2018.18.1.014","journal-title":"J. Semiconduct. Technol. Sci."},{"key":"233_CR35","doi-asserted-by":"publisher","unstructured":"Pelaez, A., Quiroz, A., Browne, J.C., Chuah, E., Parashar, M.: Online failure prediction for HPC resources using decentralized clustering. In: Proceedings of IEEE International Conference on High Performance Computing (HiPC), pp. 1\u20139 (2014). https:\/\/doi.org\/10.1109\/HiPC.2014.7116903","DOI":"10.1109\/HiPC.2014.7116903"},{"issue":"12","key":"233_CR36","doi-asserted-by":"publisher","first-page":"3990","DOI":"10.1109\/TPDS.2022.3175666","volume":"33","author":"A Psistakis","year":"2022","unstructured":"Psistakis, A., Chrysos, N., Chaix, F., Asiminakis, M., Gianioudis, M., Xirouchakis, P., Papaefstathiou, V., Katevenis, M.: Optimized page fault handling during RDMA. IEEE Trans. Parallel Distrib. Syst. 33(12), 3990\u20134005 (2022). https:\/\/doi.org\/10.1109\/TPDS.2022.3175666","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"233_CR37","doi-asserted-by":"publisher","unstructured":"Ren, R., Li, J., Yin, Y., Tian, S.: Failure prediction for large-scale clusters logs via mining frequent patterns. In: Intelligent Computing and Block Chain, pp. 147\u2013165. Springer, Singapore (2021). https:\/\/doi.org\/10.1007\/978-981-16-1160-5_13","DOI":"10.1007\/978-981-16-1160-5_13"},{"key":"233_CR38","unstructured":"Tan, P.-N., Steinbach, M., Kumar, V.: Introduction to Data Mining. Addison-Wesley, Boston (2006)"},{"key":"233_CR39","doi-asserted-by":"publisher","unstructured":"Thomasian, A.: Storage Systems, pp. 1\u2013746. Morgan Kaufmann (2022). https:\/\/doi.org\/10.1016\/B978-0-32-390796-5.00024-3. https:\/\/www.sciencedirect.com\/science\/article\/pii\/B9780323907965000243","DOI":"10.1016\/B978-0-32-390796-5.00024-3"},{"key":"233_CR40","doi-asserted-by":"publisher","unstructured":"Tirumalasetty, C., Chou, C.C., Reddy, N., Gratz, P., Abouelwafa, A.: Reducing minor page fault overheads through enhanced page walker. ACM Trans. Architect. Code Optim. 19(4) (2022). https:\/\/doi.org\/10.1145\/3547142","DOI":"10.1145\/3547142"},{"key":"233_CR41","unstructured":"Walpole, R.E., Myers, R.H., Myers, S.L.: Probability and Statistics for Engineers and Scientists. Prentice Hall International, Boston (1998)"},{"key":"233_CR42","volume-title":"Introduction to High Performance Computing for Scientists and Engineers","author":"G Wellein","year":"2010","unstructured":"Wellein, G., Hager, G.: Introduction to High Performance Computing for Scientists and Engineers, p. 356. CRC Press, USA (2010)"},{"issue":"23","key":"233_CR43","doi-asserted-by":"publisher","first-page":"7202","DOI":"10.1002\/cpe.7202","volume":"34","author":"G Xian","year":"2022","unstructured":"Xian, G., Zhang, X., Yu, J., Wang, G., Yang, W., Zhou, L., Wu, Y., Li, X., He, X.: PreF: Predicting job failure on supercomputers with job path and user behavior. Concurrency and Computation: Practice and Experience 34(23), 7202 (2022). https:\/\/doi.org\/10.1002\/cpe.7202","journal-title":"Concurrency and Computation: Practice and Experience"},{"key":"233_CR44","doi-asserted-by":"publisher","unstructured":"Zheng, Z., Lan, Z., Gupta, R., Coghlan, S., Beckman, P.: A practical failure prediction with location and lead time for blue gene\/p. In: 1st Workshop on Fault-Tolerance for HPC at Extreme Scale (in Conjunction with IEEE\/IFIP DSN 2010) (2010). https:\/\/doi.org\/10.1109\/DSNW.2010.5542627","DOI":"10.1109\/DSNW.2010.5542627"},{"key":"233_CR45","doi-asserted-by":"publisher","unstructured":"Zheng, Z., Lan, Z., Park, B.H., Geist, A.: System log pre-processing to improve failure prediction. In: Proceedings of IEEE\/IFIP Nternational Conference on Dependable Systems and Networks (DSN) (2009). https:\/\/doi.org\/10.1109\/DSN.2009.5270289","DOI":"10.1109\/DSN.2009.5270289"}],"container-title":["CCF Transactions on High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00233-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42514-025-00233-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42514-025-00233-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T23:03:36Z","timestamp":1761174216000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42514-025-00233-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,5]]},"references-count":45,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["233"],"URL":"https:\/\/doi.org\/10.1007\/s42514-025-00233-2","relation":{},"ISSN":["2524-4922","2524-4930"],"issn-type":[{"value":"2524-4922","type":"print"},{"value":"2524-4930","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,5]]},"assertion":[{"value":"15 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors certify that they have no affiliations with or involvement in any organization or entity with any financial interest or non-financial interest in the subject matter or materials discussed in this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}