{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,18]],"date-time":"2025-04-18T19:04:00Z","timestamp":1745003040412},"publisher-location":"Berlin, Heidelberg","reference-count":34,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642126581"},{"type":"electronic","value":"9783642126598"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2010]]},"DOI":"10.1007\/978-3-642-12659-8_23","type":"book-chapter","created":{"date-parts":[[2010,4,19]],"date-time":"2010-04-19T11:20:05Z","timestamp":1271676005000},"page":"304-322","source":"Crossref","is-referenced-by-count":3,"title":["Failure Data-Driven Selective Node-Level Duplication to Improve MTTF in High Performance Computing Systems"],"prefix":"10.1007","author":[{"given":"Nithin","family":"Nakka","sequence":"first","affiliation":[]},{"given":"Alok","family":"Choudhary","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Plank, J.S., Elwasif, W.R.: Experimental assessment of workstation failures and their impact on checkpointing systems. In: Proceedings of Fault Tolerant Computing Systems, FTCS 1998 (1998)","DOI":"10.1109\/FTCS.1998.689454"},{"key":"23_CR2","unstructured":"Nath, S., Yu, H., Gibbons, P.B., Seshan, S.: Subtleties in tolerating correlated failures. In: Proceedings of the Symposium On Networked Systems Design and Implementation, NSDI 2006 (2006)"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Heath, T., Martin, R.P., Nguyen, T.D.: Improving cluster availability using workstation validation. In: Proceedings of ACM SIGMETRICS (2002)","DOI":"10.1145\/511334.511362"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Long, D., Muir, A., Golding, R.: A longitudinal survey of internet host reliability. In: Proceedings of the 14th Intl. Symposium on Reliable Distributed Systems (1995)","DOI":"10.1109\/RELDIS.1995.518718"},{"key":"23_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"432","DOI":"10.1007\/11549468_50","volume-title":"Euro-Par 2005 Parallel Processing","author":"D. Nurmi","year":"2005","unstructured":"Nurmi, D., Brevik, J., Wolski, R.: Modeling machine availability in enterprise and wide-area distributed computing environments. In: Cunha, J.C., Medeiros, P.D. (eds.) Euro-Par 2005. LNCS, vol.\u00a03648, pp. 432\u2013441. Springer, Heidelberg (2005)"},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Sahoo, R.K., Sivasubramaniam, A., Squillante, M.S., Zhang, Y.: Failure data analysis of a large-scale heterogeneous server environment. In: Proceedings of Dependable Systems and Networks (June 2004)","DOI":"10.1109\/DSN.2004.1311948"},{"key":"23_CR7","unstructured":"Tang, D., Iyer, R.K., Subramani, S.S.: Failure analysis and modelling of a VAX cluster system. In: Fault Tolerant Computing Systems (1990)"},{"key":"23_CR8","unstructured":"Xu, J., Kalbarczyk, Z., Iyer, R.K.: Networked Windows NT system field failure data analysis. In: Proc. of the Pacific Rim International Symposium on Dependable Computing (1999)"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Schroeder, B., Gibson, G.: A large-scale study of failures in high-performance-computing systems. In: Proceedings of the International Conference on Dependable Systems and Networks (DSN), Philadelphia, PA (June 2006)","DOI":"10.1109\/DSN.2006.5"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Iyer, R.K., Rossetti, D.J., Hsueh, M.C.: Measurement and modeling of computer reliability as affected by system activity. ACM Transactions on Computing Systems\u00a04(3) (1986)","DOI":"10.1145\/6420.6422"},{"key":"23_CR11","unstructured":"Castillo, X., Siewiorek, D.: Workload, performance, and reliability of digital computing systems. In: 11th International Conference on Fault Tolerant Computing Systems (1981)"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Oliner, A.J., Stearley, J.: What Supercomputers Say: A Study of Five System Logs. In: Proceedings of the International Conference on Dependable Systems and Networks (DSN), Edinburgh, UK, June 2007, pp. 575\u2013584 (2007)","DOI":"10.1109\/DSN.2007.103"},{"key":"23_CR13","unstructured":"Lan, Z., Li, Y., Gujrati, P., Zheng, Z., Thakur, R., White, J.: A Fault Diagnosis and Prognosis Service for TeraGrid Clusters. In: Proceedings of TeraGrid 2007 (2007)"},{"key":"23_CR14","unstructured":"Gujrati, P., Li, Y., Lan, Z., Thakur, R., White, J.: Exploring Meta-learning to Improve Failure Prediction in Supercomputing Clusters. In: Proceedings of International Conference on Parallel Processing, ICPP (2007)"},{"key":"23_CR15","unstructured":"Li, Y., Lan, Z.: Using Adaptive Fault Tolerance to Improve Application Robustness on the TeraGrid. In: Proceedings of TeraGrid 2007 (2007)"},{"issue":"12","key":"23_CR16","doi-asserted-by":"publisher","first-page":"1647","DOI":"10.1109\/TC.2008.90","volume":"57","author":"Z. Lan","year":"2008","unstructured":"Lan, Z., Li, Y.: Adaptive Fault Management of Parallel Applications for High Performance Computing. IEEE Transactions on Computers\u00a057(12), 1647\u20131660 (2008)","journal-title":"IEEE Transactions on Computers"},{"key":"23_CR17","unstructured":"Oliner, A.J., Sahoo, R.K., Moreira, J.E., Gupta, M., Sivasubramaniam, A.: Fault-aware job scheduling for Bluegene\/L systems. In: Proceedings of the 18th International Parallel and Distributed Processing Symposium, IPDPS (2004)"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"Weaver, C., Austin, T.: A fault tolerant approach to microprocessor design. In: Proceedings of the International Conference on Dependable Systems and Networks, July 2001, pp. 411\u2013420 (2001)","DOI":"10.1109\/DSN.2001.941425"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Austin, T.: DIVA: A reliable substrate for deep submicron microarchitecture design. In: Proceedings of the Thirty-Second International Symposium on Microarchitecture, November 1999, pp. 196\u2013207 (1999)","DOI":"10.1109\/MICRO.1999.809458"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Vijaykumar, T., Pomeranz, I., Cheng, K.: Transient fault recovery using simultaneous multithreading. In: Proceedings of the Twenty-Ninth Annual International Symposium on Computer Architecture, May 2002, pp. 87\u201398 (2002)","DOI":"10.1109\/ISCA.2002.1003565"},{"issue":"1","key":"23_CR21","doi-asserted-by":"publisher","first-page":"63","DOI":"10.1109\/24.994913","volume":"51","author":"N. Oh","year":"2002","unstructured":"Oh, N., Shirvani, P.P., McCluskey, E.J.: Error detection by duplicated instructions in superscalar processors. IEEE Transactions on Reliability\u00a051(1), 63\u201375 (2002)","journal-title":"IEEE Transactions on Reliability"},{"issue":"2","key":"23_CR22","doi-asserted-by":"publisher","first-page":"180","DOI":"10.1109\/12.980007","volume":"51","author":"N. Oh","year":"2002","unstructured":"Oh, N., Mitra, S., McCluskey, E.J.: ED4I: Error Detection by Diverse Data and Duplicated Instructions. IEEE Transactions on Computers\u00a051(2), 180\u2013199 (2002)","journal-title":"IEEE Transactions on Computers"},{"issue":"2","key":"23_CR23","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1109\/40.755464","volume":"19","author":"T. Slegel","year":"1999","unstructured":"Slegel, T., et al.: IBM\u2019s S\/390 G5 microprocessor design. IEEE Micro\u00a019(2), 12\u201323 (1999)","journal-title":"IEEE Micro"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Tullsen, D.M., Eggers, S.J., Levy, H.M.: Simultaneous multithreading: Maximizing on-chip performance. In: Proceedings of the Twenty-Second International Symposium on Computer Architecture, June 1995, pp. 392\u2013403 (1995)","DOI":"10.1145\/225830.224449"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Rotenberg, E.: AR-SMT: A microarchitectural approach to fault tolerance in microprocessors. In: Proceedings of the Twenty-Ninth International Symposium on Fault-Tolerant Computing Systems, June 1999, pp. 84\u201391 (1999)","DOI":"10.1109\/FTCS.1999.781037"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Sundaramoorthy, K., Purser, Z., Rotenberg, E.: Slipstream processors: Improving both performance and fault tolerance. In: Proceedings of the Thirty-Third International Symposium on Microarchitecture, December 2000, pp. 269\u2013280 (2000)","DOI":"10.1145\/378993.379247"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Reinhardt, S.K., Mukherjee, S.S.: Transient fault detection via simultaneous multithreading. In: Proceedings of the Twenty-Seventh International Symposium on Computer Architecture, June 2000, pp. 25\u201336 (2000)","DOI":"10.1145\/339647.339652"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Qureshi, M.A., Mutlu, O., Patt, Y.N.: Microarchitecture-based introspection: A technique for transient-fault tolerance in microprocessors. In: Proceedings of International Conference on Dependable Systems and Networks, June 2005, pp. 434\u2013443 (2005)","DOI":"10.1109\/DSN.2005.62"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Parashar, A., Sivasubramaniam, A., Gurumurthi, S.: SlicK: slice-based locality exploitation for efficient redundant multithreading. In: Proceedings of the 12th Intl., conference on ASPLOS (2006)","DOI":"10.1145\/1168857.1168870"},{"issue":"1","key":"23_CR30","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1147\/rd.201.0005","volume":"20","author":"A.E. Cooper","year":"1976","unstructured":"Cooper, A.E., Chow, W.T.: Development of on-board space computer systems. IBM Journal of Research and Development\u00a020(1), 5\u201319 (1976)","journal-title":"IBM Journal of Research and Development"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Jewett, D.: Integrity S2: A fault-tolerant Unix platform. In: Digest of Papers Fault-Tolerant Computing: The Twenty-First International Symposium, Montreal, Canada, June 25-27, pp. 512\u2013519 (1991)","DOI":"10.1109\/FTCS.1991.146709"},{"key":"23_CR32","unstructured":"AT&T 5ESSTM from top to bottom, http:\/\/www.morehouse.org\/hin\/ess\/ess05.htm"},{"key":"23_CR33","unstructured":"AT&T Technical Staff. The 5ESS switching system. The AT&T Technical Journal 64(6), Part 2 (July-August 1985)"},{"issue":"11","key":"23_CR34","first-page":"1332","volume":"20","author":"A. Avizienis","year":"1971","unstructured":"Avizienis, A.: Arithmetic error codes: Cost and effectiveness studies for Application in digital system design. IEEE Transactions on Computers\u00a020(11), 1332\u20131331 (1971)","journal-title":"IEEE Transactions on Computers"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing Systems and Applications"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-12659-8_23.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,11,24]],"date-time":"2020-11-24T02:56:05Z","timestamp":1606186565000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-12659-8_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010]]},"ISBN":["9783642126581","9783642126598"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-12659-8_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2010]]}}}