{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T20:55:22Z","timestamp":1760043322492,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072302,61960206002"],"award-info":[{"award-number":["62072302,61960206002"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Alibaba Innovation Research Project","award":["2022010307"],"award-info":[{"award-number":["2022010307"]}]},{"name":"the Key R&D Program of Zhejiang Province","award":["2023R5202"],"award-info":[{"award-number":["2023R5202"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3689031.3696088","type":"proceedings-article","created":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T06:25:20Z","timestamp":1742970320000},"page":"1334-1349","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["FlowCheck: Decoupling Checkpointing and Training of Large-Scale Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5016-8472","authenticated-orcid":false,"given":"Zimeng","family":"Huang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China and Alibaba Cloud Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3031-5930","authenticated-orcid":false,"given":"Hao","family":"Nie","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, China and Peking University Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2511-0003","authenticated-orcid":false,"given":"Haonan","family":"Jia","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6711-4342","authenticated-orcid":false,"given":"Bo","family":"Jiang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9018-8881","authenticated-orcid":false,"given":"Junchen","family":"Guo","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2502-8686","authenticated-orcid":false,"given":"Jianyuan","family":"Lu","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6384-1165","authenticated-orcid":false,"given":"Rong","family":"Wen","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8096-5528","authenticated-orcid":false,"given":"Biao","family":"Lyu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China and Alibaba Cloud Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7756-0952","authenticated-orcid":false,"given":"Shunmin","family":"Zhu","sequence":"additional","affiliation":[{"name":"Hangzhou Feitian Cloud Hangzhou, China and Alibaba Cloud Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0357-8356","authenticated-orcid":false,"given":"Xinbing","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. Tcpdump. https:\/\/github.com\/the-tcpdump-group\/tcpdump."},{"key":"e_1_3_2_1_2_1","unstructured":"2015. Memory Transactions. https:\/\/docs.nvidia.com\/gameworks\/content\/developertools\/desktop\/analysis\/report\/cudaexperiments\/sourcelevel\/memorytransactions.htm."},{"key":"e_1_3_2_1_3_1","unstructured":"2016. NVIDIA NVLINK. https:\/\/www.nvidia.com\/en-us\/design-visualization\/nvlink-bridges\/."},{"key":"e_1_3_2_1_4_1","volume-title":"https:\/\/github.com\/bigscience-workshop\/bigscience\/blob\/master\/train\/tr11-176B-ml\/chronicles.md","author":"Chronicles BLOOM","year":"2022","unstructured":"2023. BLOOM Chronicles. https:\/\/github.com\/bigscience-workshop\/bigscience\/blob\/master\/train\/tr11-176B-ml\/chronicles.md, 2022."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. AMD Instinct\u2122 MI300X Accelerators. https:\/\/www.amd.com\/en\/products\/accelerators\/instinct\/mi300\/mi300x.html"},{"key":"e_1_3_2_1_6_1","volume-title":"Vidushi Vashishth, Kexin Rong, and Alexey Tumanov.","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Sameer Reddy, Satwik Bhattamishra, Venkata Prabhakara Sarath Nookala, Vidushi Vashishth, Kexin Rong, and Alexey Tumanov. 2023. DynaQuant: Compressing Deep Learning Training Checkpoints via Dynamic Quantization. http:\/\/arxiv.org\/abs\/2306.11800 arXiv:2306.11800 [cs]."},{"key":"e_1_3_2_1_7_1","first-page":"1","article-title":"PaLM: Scaling Language Modeling with Pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. PaLM: Scaling Language Modeling with Pathways. Journal of Machine Learning Research 24, 240 (2023), 1--113.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_8_1","volume-title":"Leonardo Bautista-Gomez, and Franck Cappello.","author":"Di Sheng","year":"2014","unstructured":"Sheng Di, Mohamed Slim Bouguerra, Leonardo Bautista-Gomez, and Franck Cappello. 2014. Optimization of multi-level checkpoint model for large scale HPC applications. In 2014 IEEE 28th international parallel and distributed processing symposium. IEEE, 1181--1190."},{"key":"e_1_3_2_1_9_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: A checkpointing system for training deep learning recommendation models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 929--943."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/SRDS47363.2019.00025"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2024.3373763"},{"key":"e_1_3_2_1_13_1","unstructured":"Andrew Gibiansky. 2017. Bringing HPC techniques to deep learning. http:\/\/research.baidu.com\/bringing-hpc-techniques-deep-learning."},{"key":"e_1_3_2_1_14_1","volume-title":"large minibatch SGD: Training Imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, large minibatch SGD: Training Imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_15_1","volume-title":"Accelerating Network Receive Processing. In Linux Symposium. Citeseer, 281","author":"Grover Andrew","year":"2005","unstructured":"Andrew Grover and Christopher Leech. 2005. Accelerating Network Receive Processing. In Linux Symposium. Citeseer, 281."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/1592568.1592577"},{"key":"e_1_3_2_1_17_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in Neural Information Processing Systems 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607102"},{"key":"e_1_3_2_1_19_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU clusters for DNN training workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 947--960."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3028367"},{"key":"e_1_3_2_1_21_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, et al. 2024. MegaScale: Scaling large language model training to more than 10,000 GPUs. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 745--760."},{"key":"e_1_3_2_1_22_1","volume-title":"On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836","author":"Keskar Nitish Shirish","year":"2016","unstructured":"Nitish Shirish Keskar, Dheevatsa Mudigere, Jorge Nocedal, Mikhail Smelyanskiy, and Ping Tak Peter Tang. 2016. On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836 (2016)."},{"key":"e_1_3_2_1_23_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_24_1","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania et al. 2020. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_1_25_1","first-page":"637","article-title":"Understanding and improving failure tolerant training for deep learning recommendation with partial recovery","volume":"3","author":"Maeng Kiwan","year":"2021","unstructured":"Kiwan Maeng, Shivam Bharuka, Isabel Gao, Mark Jeffrey, Vikram Saraph, Bor-Yiing Su, Caroline Trippel, Jiyan Yang, Mike Rabbat, Brandon Lucia, et al. 2021. Understanding and improving failure tolerant training for deep learning recommendation with partial recovery. Proceedings of Machine Learning and Systems 3 (2021), 637--651.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_26_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). 203--216."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_29_1","unstructured":"NVIDIA. 2021. NCCL. https:\/\/developer.nvidia.com\/NCCL."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2024.3377531"},{"key":"e_1_3_2_1_32_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_33_1","unstructured":"Shaden Smith Mostofa Patwary Brandon Norick Patrick LeGresley Samyam Rajbhandari Jared Casper Zhun Liu Shrimai Prabhumoye George Zerveas Vijay Korthikanti et al. 2022. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B A Large-Scale Generative Language Model. arXiv preprint arXiv:2201.11990 (2022)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.5574"},{"key":"e_1_3_2_1_35_1","volume-title":"DLRover: An Elastic Deep Training Extension with Auto Job Resource Recommendation. arXiv preprint arXiv:2304.01468","author":"Wang Qinlong","year":"2023","unstructured":"Qinlong Wang, Bo Sang, Haitao Zhang, Mingjie Tang, and Ke Zhang. 2023. DLRover: An Elastic Deep Training Extension with Auto Job Resource Recommendation. arXiv preprint arXiv:2304.01468 (2023)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2022.04.019"},{"key":"e_1_3_2_1_37_1","volume-title":"Bingsheng He, and Xiaowen Chu.","author":"Wang Yuxin","year":"2023","unstructured":"Yuxin Wang, Shaohuai Shi, Xin He, Zhenheng Tang, Xinglin Pan, Yang Zheng, Xiaoyu Wu, Amelie Chi Zhou, Bingsheng He, and Xiaowen Chu. 2023. Reliable and Efficient In-Memory Fault Tolerance of Large Language Model Pretraining. arXiv preprint arXiv:2310.12670 (2023)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567505"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2017.102"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00035"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492866.3549716"},{"key":"e_1_3_2_1_43_1","volume-title":"LoRA-FA: Memory-efficient Low-rank Adaptation for Large Language Models Fine-tuning. arXiv preprint arXiv:2308.03303","author":"Zhang Longteng","year":"2023","unstructured":"Longteng Zhang, Lin Zhang, Shaohuai Shi, Xiaowen Chu, and Bo Li. 2023. LoRA-FA: Memory-efficient Low-rank Adaptation for Large Language Models Fine-tuning. arXiv preprint arXiv:2308.03303 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611479.3611514"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer et al. 2023. Pytorch FSDP: experiences on scaling fully sharded data parallel. arXiv preprint arXiv:2304.11277 (2023).","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577510"},{"key":"e_1_3_2_1_48_1","volume-title":"Parallelized stochastic gradient descent. Advances in Neural Information Processing Systems 23","author":"Zinkevich Martin","year":"2010","unstructured":"Martin Zinkevich, Markus Weimer, Lihong Li, and Alex Smola. 2010. Parallelized stochastic gradient descent. Advances in Neural Information Processing Systems 23 (2010)."}],"event":{"name":"EuroSys '25: Twentieth European Conference on Computer Systems","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"],"location":"Rotterdam Netherlands","acronym":"EuroSys '25"},"container-title":["Proceedings of the Twentieth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696088","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689031.3696088","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:21:22Z","timestamp":1755775282000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3696088"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":48,"alternative-id":["10.1145\/3689031.3696088","10.1145\/3689031"],"URL":"https:\/\/doi.org\/10.1145\/3689031.3696088","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}