{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:57:17Z","timestamp":1782835037763,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T00:00:00Z","timestamp":1713744000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,22]]},"DOI":"10.1145\/3627703.3650085","type":"proceedings-article","created":{"date-parts":[[2024,4,18]],"date-time":"2024-04-18T06:28:28Z","timestamp":1713421708000},"page":"1110-1125","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["Just-In-Time Checkpointing: Low Cost Error Recovery from Deep Learning Training Failures"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2944-1632","authenticated-orcid":false,"given":"Tanmaey","family":"Gupta","sequence":"first","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0125-1956","authenticated-orcid":false,"given":"Sanjeev","family":"Krishnan","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3870-6671","authenticated-orcid":false,"given":"Rituraj","family":"Kumar","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8570-4577","authenticated-orcid":false,"given":"Abhishek","family":"Vijeev","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4862-0057","authenticated-orcid":false,"given":"Bhargav","family":"Gulavani","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0354-6204","authenticated-orcid":false,"given":"Nipun","family":"Kwatra","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0007-6040","authenticated-orcid":false,"given":"Ramachandran","family":"Ramjee","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6190-9733","authenticated-orcid":false,"given":"Muthian","family":"Sivathanu","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"BLOOM Training. https:\/\/huggingface.co\/blog\/bloom-megatron-deepspeed#training-difficulties."},{"key":"e_1_3_2_1_2_1","unstructured":"Boost Checkpoint Speed and Reduce Cost with Nebula. https:\/\/learn.microsoft.com\/en-us\/azure\/machine-learning\/reference-checkpoint-performance-for-large-models."},{"key":"e_1_3_2_1_3_1","unstructured":"CRIU: Checkpoint Restore in Userspace. https:\/\/criu.org\/Main_Page."},{"key":"e_1_3_2_1_4_1","unstructured":"Deepspeed. https:\/\/www.deepspeed.ai\/."},{"key":"e_1_3_2_1_5_1","unstructured":"DeepSpeed: Extreme-scale model training for everyone. https:\/\/www.microsoft.com\/en-us\/research\/blog\/deepspeed-extreme-scale-model-training-for-everyone\/."},{"key":"e_1_3_2_1_6_1","unstructured":"Elastic Horovod. https:\/\/horovod.readthedocs.io\/en\/stable\/elastic_include.html."},{"key":"e_1_3_2_1_7_1","unstructured":"Fully Sharded Data Parallel: faster AI training with fewer GPUs. https:\/\/engineering.fb.com\/2021\/07\/15\/open-source\/fsdp\/."},{"key":"e_1_3_2_1_8_1","unstructured":"Hugging Face Transformers. https:\/\/huggingface.co\/docs\/transformers."},{"key":"e_1_3_2_1_9_1","unstructured":"ld.so - Linux manual page. https:\/\/man7.org\/linux\/man-pages\/man8\/ld.so.8.html."},{"key":"e_1_3_2_1_10_1","unstructured":"NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_1_11_1","unstructured":"Pytorch. https:\/\/pytorch.org\/."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/524778"},{"key":"e_1_3_2_1_13_1","volume-title":"Language models are few-shot learners. arXiv preprint arXiv:2005.14165","author":"Brown Tom B","year":"2020","unstructured":"Tom B Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. arXiv preprint arXiv:2005.14165, 2020."},{"key":"e_1_3_2_1_14_1","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung Won Chung Charles Sutton Sebastian Gehrmann Parker Schuh Kensen Shi Sasha Tsvyashchenko Joshua Maynez Abhishek Rao Parker Barnes Yi Tay Noam Shazeer Vinodkumar Prabhakaran Emily Reif Nan Du Ben Hutchinson Reiner Pope James Bradbury Jacob Austin Michael Isard Guy Gur-Ari Pengcheng Yin Toju Duke Anselm Levskaya Sanjay Ghemawat Sunipa Dev Henryk Michalewski Xavier Garcia Vedant Misra Kevin Robinson Liam Fedus Denny Zhou Daphne Ippolito David Luan Hyeontaek Lim Barret Zoph Alexander Spiridonov Ryan Sepassi David Dohan Shivani Agrawal Mark Omernick Andrew M. Dai Thanumalayan Sankaranarayana Pillai Marie Pellat Aitor Lewkowycz Erica Moreira Rewon Child Oleksandr Polozov Katherine Lee Zongwei Zhou Xuezhi Wang Brennan Saeta Mark Diaz Orhan Firat Michele Catasta Jason Wei Kathy Meier-Hellstern Douglas Eck Jeff Dean Slav Petrov and Noah Fiedel. Palm: Scaling language modeling with pathways 2022."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.5555\/1134241.1708449"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.122"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-84696-0"},{"key":"e_1_3_2_1_18_1","first-page":"929","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. Check-N-Run: a checkpointing system for training deep learning recommendation models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22), pages 929--943, Renton, WA, April 2022. USENIX Association."},{"key":"e_1_3_2_1_19_1","volume-title":"A survey of rollback-recovery protocols in message-passing systems. ACM Comput. Surv., 34(3), sep","author":"Elnozahy E. N.","year":"2002","unstructured":"E. N. (Mootaz) Elnozahy, Lorenzo Alvisi, Yi-Min Wang, and David B. Johnson. A survey of rollback-recovery protocols in message-passing systems. ACM Comput. Surv., 34(3), sep 2002."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/3358807.3358888"},{"issue":"12","key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","DOI":"10.1109\/TC.2008.90","article-title":"Adaptive fault management of parallel applications for high-performance computing","volume":"57","author":"Lan Zhiling","year":"2008","unstructured":"Zhiling Lan and Yawei Li. Adaptive fault management of parallel applications for high-performance computing. IEEE Transactions on Computers, 57(12), 2008.","journal-title":"IEEE Transactions on Computers"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 4th MLSys Conference","author":"Maeng Kiwan","year":"2021","unstructured":"Kiwan Maeng, Shivam Bharuka, Isabel Gao, Mark C. Jeffrey, Vikram Saraph, Bor-Yiing Su, Caroline Trippel, Jiyan Yang, Mike Rabbat, Brandon Lucia, and Carole-Jean W. Cpr: Understanding and improving failure tolerant training for deep learning recommendation with partial recovery. In Proceedings of the 4th MLSys Conference, 2021."},{"key":"e_1_3_2_1_23_1","first-page":"203","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. Check-Freq: Frequent, Fine-Grained DNN checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21), pages 203--216. USENIX Association, February 2021."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid49817.2020.00-76"},{"key":"e_1_3_2_1_25_1","volume-title":"Gpt-4 technical report","author":"AI.","year":"2023","unstructured":"OpenAI. Gpt-4 technical report, 2023."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid51090.2021.00020"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning, Proceedings of Machine Learning Research","author":"Qiao Aurick","year":"2019","unstructured":"Aurick Qiao, Bryon Aragam, Bingjing Zhang, and Eric Xing. Fault tolerance in iterative-convergent machine learning. In Proceedings of the 36th International Conference on Machine Learning, Proceedings of Machine Learning Research, 2019."},{"key":"e_1_3_2_1_28_1","volume-title":"Zero: Memory optimization towards training A trillion parameter models. CoRR, abs\/1910.02054","author":"Rajbhandari Samyam","year":"2019","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. Zero: Memory optimization towards training A trillion parameter models. CoRR, abs\/1910.02054, 2019."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/829523.830989"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2013.145"},{"key":"e_1_3_2_1_31_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using gpu model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multi-billion parameter language models using gpu model parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_32_1","volume-title":"Planet-scale, preemptive and elastic scheduling of ai workloads","author":"Shukla Dharma","year":"2022","unstructured":"Dharma Shukla, Muthian Sivathanu, Srinidhi Viswanatha, Bhargav Gulavani, Rimma Nehme, Amey Agrawal, Chen Chen, Nipun Kwatra, Ramachandran Ramjee, Pankaj Sharma, Atul Katiyar, Vipul Modi, Vaibhav Sharma, Abhishek Singh, Shreshth Singhal, Kaustubh Welankar, Lu Xun, Ravi Anupindi, Karthik Elangovan, Hasibur Rahman, Zhou Lin, Rahul Seetharaman, Cheng Xu, Eddie Ailijiang, Suresh Krishnappa, and Mark Russinovich. Singularity: Planet-scale, preemptive and elastic scheduling of ai workloads, 2022. https:\/\/arxiv.org\/abs\/2202.07848."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056044"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2014.101"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_1_36_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. Opt: Open pre-trained transformer language models","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. Opt: Open pre-trained transformer language models, 2022."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2019.8890989"},{"key":"e_1_3_2_1_38_1","first-page":"447","volume-title":"Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, PPoPP '23","author":"Zhong Yuchen","year":"2023","unstructured":"Yuchen Zhong, Guangming Sheng, Juncheng Liu, Jinhui Yuan, and Chuan Wu. Swift: Expedited failure recovery for large-scale dnn training. In Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, PPoPP '23, page 447--449, New York, NY, USA, 2023. Association for Computing Machinery."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472897"}],"event":{"name":"EuroSys '24: Nineteenth European Conference on Computer Systems","location":"Athens Greece","acronym":"EuroSys '24","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Nineteenth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627703.3650085","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627703.3650085","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:08:28Z","timestamp":1755824908000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627703.3650085"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,22]]},"references-count":39,"alternative-id":["10.1145\/3627703.3650085","10.1145\/3627703"],"URL":"https:\/\/doi.org\/10.1145\/3627703.3650085","relation":{},"subject":[],"published":{"date-parts":[[2024,4,22]]},"assertion":[{"value":"2024-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}