{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,17]],"date-time":"2026-02-17T12:12:10Z","timestamp":1771330330546,"version":"3.50.1"},"reference-count":44,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015,2]]},"DOI":"10.1109\/hpca.2015.7056044","type":"proceedings-article","created":{"date-parts":[[2015,3,10]],"date-time":"2015-03-10T18:13:51Z","timestamp":1426011231000},"page":"331-342","source":"Crossref","is-referenced-by-count":142,"title":["Understanding GPU errors on large-scale HPC systems and the implications for system design and operation"],"prefix":"10.1109","author":[{"given":"Devesh","family":"Tiwari","sequence":"first","affiliation":[]},{"given":"Saurabh","family":"Gupta","sequence":"additional","affiliation":[]},{"given":"James","family":"Rogers","sequence":"additional","affiliation":[]},{"given":"Don","family":"Maxwell","sequence":"additional","affiliation":[]},{"given":"Paolo","family":"Rech","sequence":"additional","affiliation":[]},{"given":"Sudharshan","family":"Vazhkudai","sequence":"additional","affiliation":[]},{"given":"Daniel","family":"Oliveira","sequence":"additional","affiliation":[]},{"given":"Dave","family":"Londo","sequence":"additional","affiliation":[]},{"given":"Nathan","family":"DeBardeleben","sequence":"additional","affiliation":[]},{"given":"Philippe","family":"Navaux","sequence":"additional","affiliation":[]},{"given":"Luigi","family":"Carro","sequence":"additional","affiliation":[]},{"given":"Arthur","family":"Bland","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1177\/1094342014522573"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/1555349.1555372"},{"key":"ref33","article-title":"Measuring the Radiation Reliability of SRAM Structures in GPUS Designed for HPC","author":"rech","year":"2014","journal-title":"IEEE workshop on silicon errors in logic-system effects"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ETS.2013.6569352"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TNS.2013.2252625"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/IOLTS.2012.6313841"},{"key":"ref37","first-page":"1","article-title":"Disk failures in the real world: What does an rnttf of 1, 000,000 hours mean to you?","volume":"7","author":"schroeder","year":"2007","journal-title":"FAST"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2009.4"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2004.1311948"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/IDT.2013.6727092"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ISPA.2011.50"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503257"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/IRPS.2011.5784522"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2013.6575356"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2014.6844486"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/SC.Companion.2012.289"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.7873\/DATE2014.354"},{"key":"ref16","article-title":"Probability and statistics","year":"2002"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2010.84"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/2150976.2150989"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CICC.2006.321010"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2011.5958210"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.microrel.2011.08.011"},{"key":"ref27","article-title":"Preparing for Exascale: ORNL Leadership Computing Facility Application Requirements and Strategy","year":"2009"},{"key":"ref3","article-title":"Understanding xid errors","year":"0"},{"key":"ref6","article-title":"Temperature effects on soft error rate due to atmospheric neutrons on 28 nm fpgas","author":"bruni","year":"2014"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TNS.2014.2301768"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TDMR.2005.853449"},{"key":"ref8","article-title":"GPU Behavior on a Large HPC Cluster","author":"debardeleben","year":"2013","journal-title":"6th Workshop on Resiliency in High Performance Computing (Resilience) in Clusters Clouds and Grids in conjunction with the 19th International European Conference on Parallel and Distributed Computing (Euro-Par 2013)"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735702"},{"key":"ref2","article-title":"Tesla k20 gpu accelerator. board specification","year":"0"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2014.62"},{"key":"ref1","article-title":"Computational science requirements for leadership computing","year":"2007"},{"key":"ref20","article-title":"Measurement and Reporting of Alpha Particle and Terrestrial Cosmic Ray-Induced Soft Errors in Semiconductor Devices","year":"2006","journal-title":"JEDEC Standard Tech Rep JESD89A"},{"key":"ref22","first-page":"476","article-title":"Filtering failure logs for a bluegene\/l prototype","author":"liang","year":"2005","journal-title":"Dependable Systems and Networks 2005 DSN 2005 Proceedings International Conference on"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"425","DOI":"10.1109\/DSN.2006.18","article-title":"Bluegene\/l failure analysis and prediction models","author":"liang","year":"2006","journal-title":"Dependable Systems and Networks 2006 DSN 2006 International Conference on"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2014.101"},{"key":"ref24","article-title":"uBLAS Library User Guide","year":"2014"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2011.6114182"},{"key":"ref23","article-title":"Top ten exascale research challenges","author":"lucas","year":"2014","journal-title":"DOE ASCAC Data Subcommittee Report"},{"key":"ref44","author":"ziegler","year":"2010","journal-title":"SER-History Trends and Challenges A Guide for Designing With Memory ICs"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/DFT.2014.6962085"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TNS.2007.902349"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2007.103"}],"event":{"name":"2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)","location":"Burlingame, CA, USA","start":{"date-parts":[[2015,2,7]]},"end":{"date-parts":[[2015,2,11]]}},"container-title":["2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7048058\/7056013\/07056044.pdf?arnumber=7056044","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,6,23]],"date-time":"2017-06-23T03:04:01Z","timestamp":1498187041000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7056044\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,2]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/hpca.2015.7056044","relation":{},"subject":[],"published":{"date-parts":[[2015,2]]}}}