{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T11:56:54Z","timestamp":1775735814085,"version":"3.50.1"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2021,4,27]],"date-time":"2021-04-27T00:00:00Z","timestamp":1619481600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,4,27]],"date-time":"2021-04-27T00:00:00Z","timestamp":1619481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2021,11]]},"DOI":"10.1007\/s11227-021-03811-7","type":"journal-article","created":{"date-parts":[[2021,4,27]],"date-time":"2021-04-27T04:02:17Z","timestamp":1619496137000},"page":"13494-13513","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":28,"title":["Smart predictive maintenance for high-performance computing systems: a literature review"],"prefix":"10.1007","volume":"77","author":[{"given":"Andr\u00e9 Luis da Cunha Dantas","family":"Lima","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vitor Moraes","family":"Aranha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Caio Jord\u00e3o de Lima","family":"Carvalho","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2219-0290","authenticated-orcid":false,"given":"Erick Giovani Sperandio","family":"Nascimento","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,4,27]]},"reference":[{"key":"3811_CR1","doi-asserted-by":"publisher","unstructured":"Aydin O, Guldamlasioglu S (2017) Using LSTM networks to predict engine condition on large scale data processing framework. In: 2017 4th International Conference on Electrical and Electronic Engineering (ICEEE). IEEE, pp 281\u2013285. https:\/\/doi.org\/10.1109\/iceee2.2017.7935834","DOI":"10.1109\/iceee2.2017.7935834"},{"key":"3811_CR2","doi-asserted-by":"publisher","first-page":"9428","DOI":"10.1609\/aaai.v33i01.33019428","volume":"33","author":"A Borghesi","year":"2019","unstructured":"Borghesi A, Bartolini A, Lombardi M, Milano M, Benini L (2019) Anomaly detection using autoencoders in high performance computing systems. Proc AAAI Conf Artif Intell 33:9428\u20139433.\u00a0https:\/\/doi.org\/10.1609\/aaai.v33i01.33019428","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"3811_CR3","doi-asserted-by":"publisher","unstructured":"Borghesi A, Libri A, Benini L, Bartolini A (2019) Online anomaly detection in hpc systems. In: 2019 IEEE International Conference on Artificial Intelligence Circuits and Systems (AICAS). IEEE, pp 229\u2013233.\u00a0https:\/\/doi.org\/10.1109\/AICAS.2019.8771527","DOI":"10.1109\/AICAS.2019.8771527"},{"key":"3811_CR4","doi-asserted-by":"publisher","first-page":"245","DOI":"10.1007\/978-3-030-22964-1_25","volume-title":"International conference on smart innovation","author":"R Caponetto","year":"2019","unstructured":"Caponetto R, Rizzo F, Russotti L, Xibilia M (2019) Deep learning algorithm for predictive maintenance of rotating machines through the analysis of the orbits shape of the rotor shaft. Ergonomics and applied human factors. International conference on smart innovation. Springer, pp 245\u2013250.\u00a0https:\/\/doi.org\/10.1007\/978-3-030-22964-1_25"},{"key":"3811_CR5","doi-asserted-by":"publisher","first-page":"106024","DOI":"10.1016\/j.cie.2019.106024","volume":"137","author":"T P Carvalho","year":"2019","unstructured":"Carvalho T P, Soares F A, Vita R, Francisco R d P, Basto J P, Alcal\u00e1 S G (2019) A systematic literature review of machine learning methods applied to predictive maintenance. Comput Ind Eng 137:106024.\u00a0https:\/\/doi.org\/10.1016\/j.cie.2019.106024","journal-title":"Comput Ind Eng"},{"key":"3811_CR6","doi-asserted-by":"publisher","unstructured":"Chen X, Lu CD, Pattabiraman K (2014) Failure prediction of jobs in compute clouds: a google cluster case study. In: 2014 IEEE international symposium on software reliability engineering workshops. IEEE, pp 341\u2013346.\u00a0https:\/\/doi.org\/10.1109\/ISSREW.2014.105","DOI":"10.1109\/ISSREW.2014.105"},{"key":"3811_CR7","doi-asserted-by":"publisher","unstructured":"Das A, Mueller F, Siegel C, Vishnu A (2018) Desh: deep learning for system health prediction of lead times to failure in hpc. In: Proceedings of the 27th international symposium on high-performance parallel and distributed computing. pp 40\u201351.\u00a0https:\/\/doi.org\/10.1145\/3208040.3208051","DOI":"10.1145\/3208040.3208051"},{"key":"3811_CR8","doi-asserted-by":"publisher","unstructured":"Du M, Li F, Zheng G, Srikumar V (2017) Deeplog: Anomaly detection and diagnosis from system logs through deep learning. In: Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security. pp 1285\u20131298.\u00a0https:\/\/doi.org\/10.1145\/3133956.3134015","DOI":"10.1145\/3133956.3134015"},{"issue":"9","key":"3811_CR9","doi-asserted-by":"publisher","first-page":"6069","DOI":"10.1109\/TII.2020.2967556","volume":"16","author":"A Essien","year":"2020","unstructured":"Essien A, Giannetti C (2020) A deep learning model for smart manufacturing using convolutional LSTM neural network autoencoders. IEEE Trans Ind Inform 16(9):6069\u20136078. https:\/\/doi.org\/10.1109\/TII.2020.2967556","journal-title":"IEEE Trans Ind Inform"},{"key":"3811_CR10","doi-asserted-by":"publisher","first-page":"103678","DOI":"10.1016\/j.engappai.2020.103678","volume":"92","author":"O Fink","year":"2020","unstructured":"Fink O, Wang Q, Svens\u00e9n M, Dersin P, Lee WJ, Ducoffe M (2020) Potential, challenges and future directions for deep learning in prognostics and health management applications. Eng Appl Artif Intell 92:103678.\u00a0https:\/\/doi.org\/10.1016\/j.engappai.2020.103678","journal-title":"Eng Appl Artif Intell"},{"key":"3811_CR11","volume-title":"Deep learning with python","author":"C Francois","year":"2017","unstructured":"Francois C (2017) Deep learning with python. Apress, Berkeley"},{"key":"3811_CR12","doi-asserted-by":"publisher","unstructured":"Ghiasvand S, Ciorba F.M (2019) Anomaly detection in high performance computers: a vicinity perspective. In: 2019 18th international symposium on parallel and distributed computing (ISPDC). IEEE, pp 112\u2013120. https:\/\/doi.org\/10.1109\/ISPDC.2019.00024","DOI":"10.1109\/ISPDC.2019.00024"},{"key":"3811_CR13","doi-asserted-by":"publisher","unstructured":"Giommi L, Bonacorsi D, Diotalevi T, Tisbeni S.R, Rinaldi L, Morganti L, Falabella A, Ronchieri E, Ceccanti A, Martelli B (2019) Towards predictive maintenance with machine learning at the INFN-CNAF computing centre. In: international symposium on grids & clouds (ISGC). Taipei, Taiwan: Proceedings of Science, p 17.\u00a0https:\/\/doi.org\/10.22323\/1.351.0003","DOI":"10.22323\/1.351.0003"},{"key":"3811_CR14","volume-title":"Deep learning","author":"I Goodfellow","year":"2016","unstructured":"Goodfellow I, Bengio Y, Courville A, Bengio Y (2016) Deep learning, vol 1. MIT press Cambridge, Cambridge"},{"issue":"1","key":"3811_CR15","doi-asserted-by":"publisher","first-page":"52","DOI":"10.4304\/jcm.7.1.52-61","volume":"7","author":"Q Guan","year":"2012","unstructured":"Guan Q, Zhang Z, Fu S (2012) Ensemble of bayesian predictors and decision trees for proactive failure management in cloud computing systems. J Commun 7(1):52\u201361. https:\/\/doi.org\/10.4304\/jcm.7.1.52-61","journal-title":"J Commun"},{"key":"3811_CR16","volume-title":"Neural networks: a comprehensive foundation","author":"S Haykin","year":"2007","unstructured":"Haykin S (2007) Neural networks: a comprehensive foundation. Prentice-Hall Inc, New Jersey"},{"issue":"8","key":"3811_CR17","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780.\u00a0https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput"},{"key":"3811_CR18","doi-asserted-by":"publisher","unstructured":"Hu B, Pang CK, Luo M, Li X, Chan HL (2012) A two-stage equipment predictive maintenance framework for high-performance manufacturing systems. In: 2012 7th IEEE Conference on Industrial Electronics and Applications (ICIEA). IEEE, pp 1343\u20131348. https:\/\/doi.org\/10.1109\/ICIEA.2012.6360931","DOI":"10.1109\/ICIEA.2012.6360931"},{"issue":"1","key":"3811_CR19","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1016\/j.infsof.2008.09.009","volume":"51","author":"B Kitchenham","year":"2009","unstructured":"Kitchenham B, Brereton OP, Budgen D, Turner M, Bailey J, Linkman S (2009) Systematic literature reviews in software engineering-a systematic literature review. Inf Softw Technol 51(1):7\u201315.\u00a0https:\/\/doi.org\/10.1016\/j.infsof.2008.09.009","journal-title":"Inf Softw Technol"},{"key":"3811_CR20","doi-asserted-by":"publisher","unstructured":"Klinkenberg J, Terboven C, Lankes S, M\u00fcller MS (2017) Data mining-based analysis of hpc center operations. In: 2017 IEEE International Conference on Cluster Computing (CLUSTER). IEEE, pp 766\u2013773.\u00a0https:\/\/doi.org\/10.1109\/CLUSTER.2017.23","DOI":"10.1109\/CLUSTER.2017.23"},{"key":"3811_CR21","doi-asserted-by":"publisher","first-page":"113100","DOI":"10.1016\/j.dss.2019.113100","volume":"125","author":"M Kraus","year":"2019","unstructured":"Kraus M, Feuerriegel S (2019) Forecasting remaining useful life: interpretable deep learning approach via variational bayesian inferences. Decis Support Syst 125:113100.\u00a0https:\/\/doi.org\/10.1016\/j.dss.2019.113100","journal-title":"Decis Support Syst"},{"key":"3811_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.ress.2017.11.021","volume":"172","author":"X Li","year":"2018","unstructured":"Li X, Ding Q, Sun JQ (2018) Remaining useful life estimation in prognostics using deep convolution neural networks. Reliab Eng Syst Saf 172:1\u201311.\u00a0https:\/\/doi.org\/10.1016\/j.ress.2017.11.021","journal-title":"Reliab Eng Syst Saf"},{"key":"3811_CR23","doi-asserted-by":"publisher","unstructured":"Lima ALDCD, Aranha VM, Sperandio EG (2019) Manuten\u00e7\u00e3o preditiva aplicada a ambientes de miss\u00e3o cr\u00edtica de supercomputa\u00e7\u00e3o utilizando intelig\u00eancia artificial: Uma revis\u00e3o sistem\u00e1tica de literatura. In: Anais do V Simp\u00f3sio Internacional de Inova\u00e7\u00e3o e Tecnologia. Blucher Engineering Proceedings, pp 657\u2013664. https:\/\/doi.org\/10.5151\/siintec2019-82","DOI":"10.5151\/siintec2019-82"},{"issue":"1","key":"3811_CR24","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1109\/TIE.2018.2807414","volume":"66","author":"B Luo","year":"2018","unstructured":"Luo B, Wang H, Liu H, Li B, Peng F (2018) Early fault detection of machine tools based on deep learning and dynamic identification. IEEE Trans Ind Electron 66(1):509\u2013518. https:\/\/doi.org\/10.1109\/TIE.2018.2807414","journal-title":"IEEE Trans Ind Electron"},{"key":"3811_CR25","doi-asserted-by":"publisher","first-page":"114","DOI":"10.1016\/j.jpdc.2019.08.008","volume":"135","author":"D Mart\u00ednez","year":"2020","unstructured":"Mart\u00ednez D, Brewer W, Strelzoff A, Wilson A, Wade D (2020) Rotorcraft virtual sensors via deep regression. J Parallel Distrib Comput 135:114\u2013126.\u00a0https:\/\/doi.org\/10.1016\/j.jpdc.2019.08.008","journal-title":"J Parallel Distrib Comput"},{"key":"3811_CR26","doi-asserted-by":"publisher","unstructured":"Mathew V, Toby T, Singh V, Rao B.M, Kumar M.G (2017) Prediction of Remaining Useful Lifetime (RUL) of turbofan engine using machine learning. In: 2017 IEEE International Conference on Circuits and Systems (ICCS). IEEE, pp 306\u2013311.\u00a0https:\/\/doi.org\/10.1109\/ICCS1.2017.8326010","DOI":"10.1109\/ICCS1.2017.8326010"},{"issue":"2","key":"3811_CR27","doi-asserted-by":"publisher","first-page":"471","DOI":"10.1007\/s10586-019-02917-1","volume":"22","author":"B Mohammed","year":"2019","unstructured":"Mohammed B, Awan I, Ugail H, Younas M (2019) Failure prediction using machine learning in a virtualised HPC system and application. Cluster Computing 22(2):471\u2013485.\u00a0https:\/\/doi.org\/10.1007\/s10586-019-02917-1","journal-title":"Cluster Computing"},{"key":"3811_CR28","doi-asserted-by":"publisher","unstructured":"Nakka N, Agrawal A, Choudhary A (2011) Predicting node failure in high performance computing systems from failure and usage logs. In: 2011 IEEE international symposium on parallel and distributed processing workshops and Phd Forum. IEEE, pp 1557\u20131566.\u00a0https:\/\/doi.org\/10.1109\/IPDPS.2011.310","DOI":"10.1109\/IPDPS.2011.310"},{"key":"3811_CR29","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1016\/j.ress.2019.03.018","volume":"188","author":"KT Nguyen","year":"2019","unstructured":"Nguyen KT, Medjaher K (2019) A new dynamic predictive maintenance framework using deep learning for failure prognostics. Reliab Eng Syst Saf 188:251\u2013262. https:\/\/doi.org\/10.1016\/j.ress.2019.03.018","journal-title":"Reliab Eng Syst Saf"},{"key":"3811_CR30","doi-asserted-by":"publisher","unstructured":"Nie B, Xue, J, Gupta S, Patel T, Engelmann C, Smirni E, Tiwari D (2018) Machine learning models for GPU error prediction in a large scale HPC system. In: 2018 48th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks (DSN). IEEE, pp 95\u2013106. https:\/\/doi.org\/10.1109\/DSN.2018.00022","DOI":"10.1109\/DSN.2018.00022"},{"key":"3811_CR31","doi-asserted-by":"publisher","DOI":"10.1016\/j.cie.2020.107060","volume":"153","author":"RM Souza","year":"2021","unstructured":"Souza RM, Nascimento EGS, Miranda UA, Silva WJD, Lepikson HA (2021) Deep learning for diagnosis and classification of faults in industrial rotating machinery. Comput Ind Eng 153:107060. https:\/\/doi.org\/10.1016\/j.cie.2020.107060","journal-title":"Comput Ind Eng"},{"key":"3811_CR32","doi-asserted-by":"publisher","unstructured":"Susto G.A, McLoone S, Pagano D, Schirru A, Pampuri S, Beghi A (2013) Prediction of integral type failures in semiconductor manufacturing through classification methods. In: 2013 IEEE 18th Conference on Emerging Technologies & Factory Automation (ETFA). IEEE, pp 1\u20134. https:\/\/doi.org\/10.1109\/ETFA.2013.6648127","DOI":"10.1109\/ETFA.2013.6648127"},{"issue":"3","key":"3811_CR33","doi-asserted-by":"publisher","first-page":"812","DOI":"10.1109\/TII.2014.2349359","volume":"11","author":"GA Susto","year":"2014","unstructured":"Susto G.A, Schirru A, Pampuri S, McLoone S, Beghi A (2014) Machine learning for predictive maintenance: a multiple classifier approach. IEEE Trans Ind Inform 11(3):812\u2013820. https:\/\/doi.org\/10.1109\/TII.2014.2349359","journal-title":"IEEE Trans Ind Inform"},{"key":"3811_CR34","doi-asserted-by":"publisher","first-page":"355","DOI":"10.1007\/978-3-319-58667-0_19","volume-title":"International supercomputing conference","author":"O Tuncer","year":"2017","unstructured":"Tuncer O, Ates E, Zhang Y, Turk A, Brandt J, Leung VJ, Egele M, Coskun AK (2017) Diagnosing performance variations in HPC applications using machine learning. International supercomputing conference. Springer, pp 355\u2013373.\u00a0https:\/\/doi.org\/10.1007\/978-3-319-58667-0_19"},{"key":"3811_CR35","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1016\/j.neucom.2017.05.063","volume":"275","author":"Y Wu","year":"2018","unstructured":"Wu Y, Yuan M, Dong S, Lin L, Liu Y (2018) Remaining useful life estimation of engineered systems using vanilla LSTM neural networks. Neurocomputing 275:167\u2013179.\u00a0https:\/\/doi.org\/10.1016\/j.neucom.2017.05.063","journal-title":"Neurocomputing"},{"key":"3811_CR36","doi-asserted-by":"publisher","unstructured":"Yurek O.E, Birant D (2019) Remaining useful life estimation for predictive maintenance using feature engineering. In: Innovations in Intelligent Systems and Applications Conference (ASYU). IEEE, pp 1\u20135. https:\/\/doi.org\/10.1109\/ASYU48272.2019.8946397","DOI":"10.1109\/ASYU48272.2019.8946397"},{"key":"3811_CR37","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1016\/j.jmsy.2018.05.011","volume":"48","author":"J Zhang","year":"2018","unstructured":"Zhang J, Wang P, Yan R, Gao R.X (2018) Long short-term memory for machine remaining life prediction. J Manuf Syst 48:78\u201386.\u00a0https:\/\/doi.org\/10.1016\/j.jmsy.2018.05.011","journal-title":"J Manuf Syst"},{"key":"3811_CR38","doi-asserted-by":"publisher","unstructured":"Zhang K, Xu J, Min M.R, Jiang G, Pelechrinis K, Zhang H (2016) Automated IT system failure prediction: a deep learning approach. In: 2016 IEEE International Conference on Big Data (Big Data). IEEE, pp 1291\u20131300. https:\/\/doi.org\/10.1109\/BigData.2016.7840733","DOI":"10.1109\/BigData.2016.7840733"},{"key":"3811_CR39","doi-asserted-by":"publisher","first-page":"253","DOI":"10.1007\/978-3-030-00916-8_24","volume-title":"International Conference on Collaborative Computing: Networking, Applications and Worksharing","author":"S Zhang","year":"2017","unstructured":"Zhang S, Li X, Wang J, Su S (2017) Curve-registration-based feature extraction for predictive maintenance of industrial equipment. International Conference on Collaborative Computing: Networking, Applications and Worksharing. Springer, pp 253\u2013263.\u00a0https:\/\/doi.org\/10.1007\/978-3-030-00916-8_24"},{"key":"3811_CR40","doi-asserted-by":"publisher","unstructured":"Zhao H, Wang J, Gao P (2017) A Deep Learning Approach for Condition-Based Monitoring and Fault Diagnosis of Rod Pump System. STIoT Editorial Board 32. https:\/\/doi.org\/10.29268\/stsc.2017.0003","DOI":"10.29268\/stsc.2017.0003"},{"key":"3811_CR41","doi-asserted-by":"publisher","first-page":"621","DOI":"10.1007\/978-3-030-46133-1_37","volume-title":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","author":"S Zheng","year":"2019","unstructured":"Zheng S, Farahat A, Gupta C (2019) Generative adversarial networks for failure prediction. Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, pp 621\u2013637.\u00a0https:\/\/doi.org\/10.1007\/978-3-030-46133-1_37"},{"key":"3811_CR42","doi-asserted-by":"publisher","unstructured":"Zhu B, Wang G, Liu X, Hu D, Lin S, Ma J (2013) Proactive drive failure prediction for large scale storage systems. In: IEEE 29th symposium on mass storage systems and technologies (MSST). IEEE, pp 1\u20135. https:\/\/doi.org\/10.1109\/MSST.2013.6558427","DOI":"10.1109\/MSST.2013.6558427"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-03811-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-021-03811-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-03811-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,10,25]],"date-time":"2021-10-25T09:46:44Z","timestamp":1635155204000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-021-03811-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,4,27]]},"references-count":42,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2021,11]]}},"alternative-id":["3811"],"URL":"https:\/\/doi.org\/10.1007\/s11227-021-03811-7","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,4,27]]},"assertion":[{"value":"12 April 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 April 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}