{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T17:46:46Z","timestamp":1764784006929,"version":"3.37.3"},"reference-count":50,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2018,11,1]],"date-time":"2018-11-01T00:00:00Z","timestamp":1541030400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2018,11,1]],"date-time":"2018-11-01T00:00:00Z","timestamp":1541030400000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2018,11,1]],"date-time":"2018-11-01T00:00:00Z","timestamp":1541030400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2018,11,1]],"date-time":"2018-11-01T00:00:00Z","timestamp":1541030400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"U.S. Department of Energy, Office of Science"},{"name":"Office of Advanced Scientific Computing Research","award":["2015-02674"],"award-info":[{"award-number":["2015-02674"]}]},{"name":"US National Science Foundation","award":["CNS 13-14891"],"award-info":[{"award-number":["CNS 13-14891"]}]},{"name":"Air Force Research Lab","award":["FA8750-11-2-0084"],"award-info":[{"award-number":["FA8750-11-2-0084"]}]},{"name":"US National Science Foundation","award":["OCI-0725070","ACI-1238993"],"award-info":[{"award-number":["OCI-0725070","ACI-1238993"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Dependable and Secure Comput."],"published-print":{"date-parts":[[2018,11,1]]},"DOI":"10.1109\/tdsc.2017.2737537","type":"journal-article","created":{"date-parts":[[2017,8,9]],"date-time":"2017-08-09T18:11:27Z","timestamp":1502302287000},"page":"915-930","source":"Crossref","is-referenced-by-count":9,"title":["Resiliency of HPC Interconnects: A Case Study of Interconnect Failures and Recovery in Blue Waters"],"prefix":"10.1109","volume":"15","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0926-0776","authenticated-orcid":false,"given":"Saurabh","family":"Jha","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1166-4729","authenticated-orcid":false,"given":"Valerio","family":"Formicola","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Catello Di","family":"Martino","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mark","family":"Dalton","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"William T.","family":"Kramer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zbigniew","family":"Kalbarczyk","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2245-3038","authenticated-orcid":false,"given":"Ravishankar K.","family":"Iyer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1177\/1094342009347445"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1177\/1094342010391989"},{"key":"ref33","first-page":"476","article-title":"Filtering failure logs for a BlueGene\/L prototype","author":"liang","year":"2005","journal-title":"Proc Int Conf Dependable Syst Netw"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2009.4"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2014.62"},{"year":"0","key":"ref30"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1177\/1094342009347767"},{"key":"ref36","article-title":"Analyzing the interplay of failures and\n workload on a leadership-class supercomputer","volume":"2","author":"meneses","year":"2015","journal-title":"Comput"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2015.50"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2007.103"},{"key":"ref28","article-title":"GPUDirect: Integrating the GPU with a network interface","author":"rossetti","year":"2015","journal-title":"Proc GPU Technology Conf"},{"year":"2016","key":"ref27"},{"key":"ref29","first-page":"8","article-title":"Understanding fault scenarios and impacts through fault\n injection experiments in cielo","author":"formicola et","year":"2017","journal-title":"Cray User Group"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2751504.2751511"},{"key":"ref1","first-page":"8","article-title":"Analysis of gemini interconnect recovery mechanisms: Methods and observations","author":"jha","year":"2016","journal-title":"Proc Cray User Group Conf"},{"journal-title":"Probability and Statistics with Reliability Queuing and Computer Science Applications","year":"2002","author":"trivedi","key":"ref20"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/0167-8191(96)00024-5"},{"year":"2016","key":"ref21"},{"key":"ref24","first-page":"1539","article-title":"PGAS (partitioned global address space) languages","author":"almasi","year":"2011","journal-title":"Encyclopedia of Parallel Computing"},{"key":"ref23","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1145\/167962.165874","article-title":"CHARM++: A portable concurrent object oriented system based on C++","volume":"28","author":"kale","year":"1993","journal-title":"ACM SIGPLAN Notices"},{"key":"ref26","first-page":"93","article-title":"FTC-charm++: An in-memory checkpoint-based fault tolerant runtime for Charm++ and MPI","author":"zheng","year":"2004","journal-title":"Proc IEEE Int Conf Cluster Comput"},{"year":"0","key":"ref25"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2004.1310775"},{"year":"0","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/0376-5075(79)90032-1"},{"key":"ref40","article-title":"ExaScale computing study: Technology\n challenges in achieving exascale systems","author":"bergman","year":"2008","journal-title":"Defense Advanced Research Projects Agency Information Processing Techniques Office"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/292469.292472"},{"key":"ref13","article-title":"Torque resource manager","author":"staples","year":"2006","journal-title":"Proc ACM\/IEEE Conf Supercomputing"},{"key":"ref14","first-page":"1","article-title":"The application level placement scheduler","author":"karo","year":"2008","journal-title":"Proc Cray User Group Conf"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-38750-0_23"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/24.58720"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/2043556.2043583"},{"key":"ref18","article-title":"File systems for clusters from a protocol perspective","author":"braam","year":"0","journal-title":"presented at the 2nd Extreme Linux Topics Workshop"},{"key":"ref19","first-page":"31","article-title":"Microreboot-A technique for cheap recovery","author":"candea","year":"2004","journal-title":"Proc 6th Conf Symp Opearting Syst Des Implementation"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/FTCS.1991.146709"},{"year":"2013","key":"ref3"},{"year":"0","key":"ref6"},{"key":"ref5","article-title":"GPGPU: General-purpose computation on graphics hardware","author":"harris","year":"2005","journal-title":"Proc Int Conf Comput Graph Interactive Techn"},{"year":"0","key":"ref8"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI.2010.23"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/52325.52356"},{"journal-title":"Principles and Practices of Interconnection Networks","year":"2004","author":"dally","key":"ref9"},{"key":"ref46","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511800467","author":"blahut","year":"2003","journal-title":"Algebraic Codes for Data Transmission"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1987.1676939"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1016\/0169-7552(89)90019-6"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/139669.140383"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-58429-3_41"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1177\/1094342009106189"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/12.67315"},{"article-title":"Torus routing chip","year":"1990","author":"dally","key":"ref43"}],"container-title":["IEEE Transactions on Dependable and Secure Computing"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielaam\/8858\/8528978\/8006294-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8858\/8528978\/08006294.pdf?arnumber=8006294","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,8]],"date-time":"2022-04-08T18:51:29Z","timestamp":1649443889000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8006294\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,11,1]]},"references-count":50,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tdsc.2017.2737537","relation":{},"ISSN":["1545-5971","1941-0018","2160-9209"],"issn-type":[{"type":"print","value":"1545-5971"},{"type":"electronic","value":"1941-0018"},{"type":"electronic","value":"2160-9209"}],"subject":[],"published":{"date-parts":[[2018,11,1]]}}}