{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T08:35:22Z","timestamp":1777106122652,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":14,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T00:00:00Z","timestamp":1676937600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Hong Kong RGC","award":["HKU 17204619, 17208920"],"award-info":[{"award-number":["HKU 17204619, 17208920"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,25]]},"DOI":"10.1145\/3572848.3577510","type":"proceedings-article","created":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T16:02:30Z","timestamp":1676995350000},"page":"447-449","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Swift"],"prefix":"10.1145","author":[{"given":"Yuchen","family":"Zhong","sequence":"first","affiliation":[{"name":"The University of Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guangming","family":"Sheng","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Juncheng","family":"Liu","sequence":"additional","affiliation":[{"name":"OneFlow Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinhui","family":"Yuan","sequence":"additional","affiliation":[{"name":"OneFlow Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chuan","family":"Wu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,2,21]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"The Horovod Authors. 2022. Elastic Horovod. https:\/\/horovod.readthedocs.io\/en\/stable\/elastic_include.html."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems.","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Proceedings of Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In Proceedings of International Conference on Learning Representations."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of International Conference on Data Engineering.","author":"Hwang Jeong-Hyon","year":"2005","unstructured":"Jeong-Hyon Hwang, Magdalena Balazinska, Alex Rasin, Ugur Cetintemel, Michael Stonebraker, and Stan Zdonik. 2005. High-availability Algorithms for Distributed Stream Processing. In Proceedings of International Conference on Data Engineering."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of International Conference on Learning Representations.","author":"Kingma Diederik P","year":"2015","unstructured":"Diederik P Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In Proceedings of International Conference on Learning Representations."},{"key":"e_1_3_2_1_7_1","unstructured":"Alex Krizhevsky Geoffrey Hinton et al. 2009. Learning Multiple Layers of Features from Tiny Images. (2009)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 4th Conference on Machine Learning and Systems.","author":"Maeng Kiwan","year":"2021","unstructured":"Kiwan Maeng, Shivam Bharuka, Isabel Gao, Mark Jeffrey, Vikram Saraph, Bor-Yiing Su, Caroline Trippel, Jiyan Yang, Mike Rabbat, Brandon Lucia, and Carole-Jean Wu. 2021. Understanding and Improving Failure Tolerant Training for Deep Learning Recommendation with Partial Recovery. In Proceedings of the 4th Conference on Machine Learning and Systems."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 19th USENIX Conference on File and Storage Technologies.","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In Proceedings of the 19th USENIX Conference on File and Storage Technologies."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Proceedings of Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of International Conference on Learning Representations.","author":"You Yang","year":"2020","unstructured":"Yang You, Jing Li, Sashank Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho-Jui Hsieh. 2020. Large Batch Optimization for Deep Learning: Training BERT in 76 Minutes. In Proceedings of International Conference on Learning Representations."},{"key":"e_1_3_2_1_14_1","volume-title":"Wide Residual Networks. arXiv preprint","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2016. Wide Residual Networks. arXiv preprint (2016)."}],"event":{"name":"PPoPP '23: The 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Montreal QC Canada","acronym":"PPoPP '23","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3572848.3577510","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3572848.3577510","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:08:09Z","timestamp":1750183689000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3572848.3577510"}},"subtitle":["Expedited Failure Recovery for Large-Scale DNN Training"],"short-title":[],"issued":{"date-parts":[[2023,2,21]]},"references-count":14,"alternative-id":["10.1145\/3572848.3577510","10.1145\/3572848"],"URL":"https:\/\/doi.org\/10.1145\/3572848.3577510","relation":{},"subject":[],"published":{"date-parts":[[2023,2,21]]},"assertion":[{"value":"2023-02-21","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}