{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T19:25:47Z","timestamp":1780773947331,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,6,23]],"date-time":"2020-06-23T00:00:00Z","timestamp":1592870400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100014718","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1525609, 1813004"],"award-info":[{"award-number":["1525609, 1813004"]}],"id":[{"id":"10.13039\/100014718","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","award":["DE-AC05-00OR22725, Exascale Computing Project 17-SC-20-SC"],"award-info":[{"award-number":["DE-AC05-00OR22725, Exascale Computing Project 17-SC-20-SC"]}],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,6,23]]},"DOI":"10.1145\/3369583.3392672","type":"proceedings-article","created":{"date-parts":[[2020,6,22]],"date-time":"2020-06-22T03:27:27Z","timestamp":1592796447000},"page":"167-171","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Orchestrating Fault Prediction with Live Migration and Checkpointing"],"prefix":"10.1145","author":[{"given":"Subhendu","family":"Behera","sequence":"first","affiliation":[{"name":"North Carolina State University, Raleigh, NC, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lipeng","family":"Wan","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, Oak Ridge, TN, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Frank","family":"Mueller","sequence":"additional","affiliation":[{"name":"North Carolina State University, Raleigh, NC, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Matthew","family":"Wolf","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, Oak Ridge, TN, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Scott","family":"Klasky","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, Oak Ridge, TN, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2020,6,23]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/WORKS.2018.00007"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063427"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2016.2643660"},{"key":"e_1_3_2_2_4_1","unstructured":"Wahid Bhimji Deborah Bard Melissa Romanus David Paul Andrey Ovsyannikov Brian Friesen and Matt and Bryson. 2016. Accelerating Science with the NERSC Burst Buffer Early User Program. https:\/\/www.nersc.gov\/assets\/Uploads\/Nersc-BB-EUP-CUG.pdf"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2013.74"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00012"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3208040.3208051"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2546248"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751504.2751509"},{"key":"e_1_3_2_2_10_1","volume-title":"Taming of the Shrew: Modeling the Normal and Faulty Behaviour of Large-scale HPC Systems. 2012 IEEE 26th International Parallel and Distributed Processing Symposium","author":"Gainaru Ana","year":"2012","unstructured":"Ana Gainaru, Franck Cappello, and William Kramer. 2012a. Taming of the Shrew: Modeling the Normal and Faulty Behaviour of Large-scale HPC Systems. 2012 IEEE 26th International Parallel and Distributed Processing Symposium (2012), 1168--1179."},{"key":"e_1_3_2_2_11_1","volume-title":"Fault Prediction under the Microscope: A Closer Look into HPC Systems (SC '12)","author":"Gainaru Ana","unstructured":"Ana Gainaru, Franck Cappello, Marc Snir, and William Kramer. 2012b. Fault Prediction under the Microscope: A Closer Look into HPC Systems (SC '12). IEEE Computer Society Press, Washington, DC, USA, Article Article 77, 11 pages."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2018.00021"},{"key":"e_1_3_2_2_13_1","volume-title":"Development of Naturally Fault Tolerant Algorithms for Computing on 100,000 Processors. (01","author":"Geist Al","year":"2003","unstructured":"Al Geist and Christian Engelmann. 2003. Development of Naturally Fault Tolerant Algorithms for Computing on 100,000 Processors. (01 2003). https:\/\/www.csm.ornl.gov\/ geist\/Lyon2002-geist.pdf"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2014.2360536"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1345206.1345230"},{"key":"e_1_3_2_2_16_1","volume-title":"Evaluating Burst Buffer Placement in HPC Systems","author":"Khetawat Harsh","unstructured":"Harsh Khetawat, Christopher Zimmer, Frank Mueller, Scott Atchley, Sudharshan Vazhkudai, and Misbah Mubarak. 2019. Evaluating Burst Buffer Placement in HPC Systems. In IEEE Cluster."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSST.2012.6232369"},{"key":"e_1_3_2_2_18_1","volume-title":"2008 IEEE International Symposium on Parallel and Distributed Processing. 1--9. https:\/\/doi.org\/10","author":"Liu Yudan","year":"2008","unstructured":"Yudan Liu, Raja Nassar, Chokchai Leangsuksun, Nichamon Naksinehaboon, Mihaela Paun, and Stephen L. Scott. 2008. An optimal checkpoint\/restart model for a large scale high performance computing system. In 2008 IEEE International Symposium on Parallel and Distributed Processing. 1--9. https:\/\/doi.org\/10.1109\/IPDPS.2008.4536279"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.2172\/1222713"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.18"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00099"},{"key":"e_1_3_2_2_22_1","unstructured":"ORNL. 2020. Spectral Library. https:\/\/www.olcf.ornl.gov\/spectral-library\/"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/2388996.2389022"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid.2014.24"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1088\/1742--6596"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2009.4"},{"key":"e_1_3_2_2_27_1","unstructured":"SimPy Team. 2020. SimPy: Discrete-Event Simulation for Python. https:\/\/pypi.org\/project\/simpy\/"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2014.101"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Sudharshan S. Vazhkudai Bronis R. de Supinski Arthur S. Bland Al Geist James Sexton Jim Kahle Christopher J. Zimmer Scott Atchley Sarp Oral Don E. Maxwell and et al. 2018. The Design Deployment and Evaluation of the CORAL Pre-Exascale Systems (Proceedings of the International Conference for High Performance Computing Networking Storage and Analysis SC '18). IEEE Press 12.","DOI":"10.1109\/SC.2018.00055"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2016.10.002"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2017.257"},{"key":"e_1_3_2_2_32_1","volume-title":"Proactive Process-Level Live Migration in HPC Environments. In SC '08: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing. https:\/\/doi.org\/10","author":"Wang Chao","unstructured":"Chao Wang, Frank Mueller, Christian Engelmann, and Stephen L. Scott. 2008. Proactive Process-Level Live Migration in HPC Environments. In SC '08: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing. https:\/\/doi.org\/10.1145\/1413370.1413414"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2011.10.009"},{"key":"e_1_3_2_2_34_1","volume-title":"SLURM: Simple Linux Utility for Resource Management. In Job Scheduling Strategies for Parallel Processing","author":"Yoo Andy B.","year":"2003","unstructured":"Andy B. Yoo, Morris A. Jette, and Mark Grondona. 2003. SLURM: Simple Linux Utility for Resource Management. In Job Scheduling Strategies for Parallel Processing, Dror Feitelson, Larry Rudolph, and Uwe Schwiegelshohn (Eds.). Springer Berlin Heidelberg, Berlin, Heidelberg, 44--60."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/361147.361115"}],"event":{"name":"HPDC '20: The 29th International Symposium on High-Performance Parallel and Distributed Computing","location":"Stockholm Sweden","acronym":"HPDC '20","sponsor":["University of Arizona University of Arizona","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 29th International Symposium on High-Performance Parallel and Distributed Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3369583.3392672","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3369583.3392672","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3369583.3392672","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3369583.3392672","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:44:58Z","timestamp":1750203898000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3369583.3392672"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,6,23]]},"references-count":35,"alternative-id":["10.1145\/3369583.3392672","10.1145\/3369583"],"URL":"https:\/\/doi.org\/10.1145\/3369583.3392672","relation":{},"subject":[],"published":{"date-parts":[[2020,6,23]]},"assertion":[{"value":"2020-06-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}