{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T15:54:09Z","timestamp":1762876449029,"version":"3.28.0"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,5,22]],"date-time":"2023-05-22T00:00:00Z","timestamp":1684713600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,5,22]],"date-time":"2023-05-22T00:00:00Z","timestamp":1684713600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,5,22]]},"DOI":"10.1109\/ets56758.2023.10173972","type":"proceedings-article","created":{"date-parts":[[2023,7,12]],"date-time":"2023-07-12T17:20:34Z","timestamp":1689182434000},"page":"1-6","source":"Crossref","is-referenced-by-count":6,"title":["Understanding Permanent Hardware Failures in Deep Learning Training Accelerator Systems"],"prefix":"10.1109","author":[{"given":"Yi","family":"He","sequence":"first","affiliation":[{"name":"University of Chicago,Chicago,USA"}]},{"given":"Yanjing","family":"Li","sequence":"additional","affiliation":[{"name":"University of Chicago,Chicago,USA"}]}],"member":"263","reference":[{"year":"2018","key":"ref1","article-title":"Nvdla open source project"},{"year":"2019","key":"ref2","article-title":"Tensorflow"},{"year":"2021","key":"ref3","article-title":"Cloud tpu"},{"year":"2021","key":"ref4","article-title":"Panel: Hardware operation at scale reliability to address silent data corruptions"},{"year":"2021","key":"ref5","article-title":"Profile your model with cloud tpu tools"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3302"},{"article-title":"Training in turmoil: Silent data corruption in systems at scale","volume-title":"International Test Conference Silicon Lifecycle Management Workshop","author":"Bonderson","key":"ref7"},{"article-title":"Tanatomy of an in-die tester for infield testing","volume-title":"IEEE International ART Workshop","author":"Chakravarty","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2022.3166108"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/2463209.2488859"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/iolts59296.2023.10224872"},{"article-title":"Detecting silent data corruptions in the wild","year":"2022","author":"Dixit","key":"ref12"},{"author":"Everingham","key":"ref13","article-title":"The PASCAL Visual Object Classes Challenge 2012 (VOC2012) Results"},{"article-title":"Understanding the difficulty of training deep feedforward neural networks","volume-title":"Proc. AISTATS","author":"Glorot","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1098\/rsta.2019.0164"},{"key":"ref16","doi-asserted-by":"crossref","DOI":"10.1109\/ICCV.2015.123","article-title":"Delving deep into rectifiers: Surpassing human-level performance on imagenet classification","author":"He","year":"2015"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.90"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00033"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ITC50571.2021.00017"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465297"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"article-title":"Learning multiple layers of features from tiny images","year":"2009","author":"Krizhevsky","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/VTS.2010.5469571"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/1403375.1403590"},{"article-title":"Tiny chips, big headaches","year":"2022","author":"Markoff","key":"ref25"},{"article-title":"Deep learning training in facebook data centers: Design of scale-up and scale-out systems","year":"2020","author":"Naumov","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00075"},{"article-title":"Yolov3: An incremental improvement","year":"2018","author":"Redmon","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2021.3051841"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/tcad.2019.2944582"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/VTS.2018.8368656"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3043449"}],"event":{"name":"2023 IEEE European Test Symposium (ETS)","start":{"date-parts":[[2023,5,22]]},"location":"Venezia, Italy","end":{"date-parts":[[2023,5,26]]}},"container-title":["2023 IEEE European Test Symposium (ETS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10173930\/10173940\/10173972.pdf?arnumber=10173972","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T13:18:11Z","timestamp":1709299091000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10173972\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,22]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/ets56758.2023.10173972","relation":{},"subject":[],"published":{"date-parts":[[2023,5,22]]}}}