{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T16:11:18Z","timestamp":1774627878767,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2015,11,15]],"date-time":"2015-11-15T00:00:00Z","timestamp":1447545600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Department of Energy","award":["DE-AC05-00OR22725"],"award-info":[{"award-number":["DE-AC05-00OR22725"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2015,11,15]]},"DOI":"10.1145\/2807591.2807666","type":"proceedings-article","created":{"date-parts":[[2015,10,27]],"date-time":"2015-10-27T13:07:31Z","timestamp":1445951251000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":69,"title":["Reliability lessons learned from GPU experience with the Titan supercomputer at Oak Ridge leadership computing facility"],"prefix":"10.1145","author":[{"given":"Devesh","family":"Tiwari","sequence":"first","affiliation":[{"name":"Oak Ridge National Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Saurabh","family":"Gupta","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"George","family":"Gallarno","sequence":"additional","affiliation":[{"name":"Christian Brothers University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jim","family":"Rogers","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Don","family":"Maxwell","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2015,11,15]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"\"Gpus speed early science apps \" http:\/\/www.hpcwire.com\/2014\/01\/06\/gpus-speed-early-science-apps\/.  \"Gpus speed early science apps \" http:\/\/www.hpcwire.com\/2014\/01\/06\/gpus-speed-early-science-apps\/."},{"key":"e_1_3_2_1_2_1","unstructured":"\"Understanding xid errors \" http:\/\/docs.nvidia.com\/deploy\/xid-errors\/index.html.  \"Understanding xid errors \" http:\/\/docs.nvidia.com\/deploy\/xid-errors\/index.html."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2007.108"},{"key":"e_1_3_2_1_4_1","first-page":"225","volume-title":"2006 IEEE International Symposium on. IEEE","author":"Alam S. R.","year":"2006"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2014.62"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPA.2011.50"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2013.6575356"},{"key":"e_1_3_2_1_8_1","unstructured":"M. Ezell \"Understanding the impact of interconnect failures on system operation \" in Proceedings of Cray User Group Conference (CUG 2013) 2013.  M. Ezell \"Understanding the impact of interconnect failures on system operation \" in Proceedings of Cray User Group Conference (CUG 2013) 2013."},{"key":"e_1_3_2_1_9_1","volume-title":"Gpu-qin: A methodology for evaluating the error resilience of gpgpu applications","author":"Fang B.","year":"2014"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.Companion.2012.289"},{"key":"e_1_3_2_1_11_1","first-page":"175","volume-title":"SRDS 2007. 26th IEEE International Symposium on. IEEE","author":"Fu S.","year":"2007"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2038633.2038637"},{"key":"e_1_3_2_1_13_1","volume-title":"Networking, Storage and Analysis","author":"Gainaru A.","year":"2012"},{"key":"e_1_3_2_1_14_1","volume-title":"Dresden","author":"Gomez L. A. B.","year":"2014"},{"key":"e_1_3_2_1_15_1","volume-title":"Understanding and exploiting spatial properties of system failures on extreme-scale hpc systems,\" International Conference on Dependable Systems and Networks (DSN)","author":"Gupta S.","year":"2015"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2010.84"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2248487.2150989"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2006.18"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2005.50"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2670529.2754959"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2007.103"},{"key":"e_1_3_2_1_22_1","first-page":"80","article-title":"A survey of general-purpose computation on graphics hardware,\" in Computer graphics forum, vol. 26, no. 1","author":"Owens J. D.","year":"2007","journal-title":"Wiley Online Library"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2011.5958210"},{"key":"e_1_3_2_1_24_1","first-page":"772","volume-title":"2004 International Conference on. IEEE","author":"Sahoo R. K.","year":"2004"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2009.4"},{"key":"e_1_3_2_1_26_1","first-page":"1","article-title":"Disk failures in the real world: What does an mttf of 1, 000, 000 hours mean to you?","volume":"7","author":"Schroeder B.","year":"2007","journal-title":"FAST"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2492101.1555372"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503257"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2011.6114182"},{"key":"e_1_3_2_1_30_1","first-page":"331","volume-title":"2015 IEEE 21st International Symposium on. IEEE","author":"Tiwari D.","year":"2015"},{"key":"e_1_3_2_1_31_1","volume-title":"Experience with gpus on the titan supercomputer from a reliability, performance and power perspective,\" in Proceedings of Cray User Group Conference (CUG","author":"Tiwari D.","year":"2015"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2014.101"}],"event":{"name":"SC15: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"Austin Texas","acronym":"SC15","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","SIGARCH ACM Special Interest Group on Computer Architecture","IEEE-CS Computer Society"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2807591.2807666","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2807591.2807666","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T06:12:44Z","timestamp":1750227164000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2807591.2807666"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,11,15]]},"references-count":32,"alternative-id":["10.1145\/2807591.2807666","10.1145\/2807591"],"URL":"https:\/\/doi.org\/10.1145\/2807591.2807666","relation":{},"subject":[],"published":{"date-parts":[[2015,11,15]]},"assertion":[{"value":"2015-11-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}