{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:54:38Z","timestamp":1776930878455,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China","award":["62272185"],"award-info":[{"award-number":["62272185"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759891","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"1113-1126","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LowDiff: Efficient Frequent Checkpointing via Low-Cost Differential for High-Performance Distributed Training Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1143-052X","authenticated-orcid":false,"given":"Chenxuan","family":"Yao","sequence":"first","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9790-7651","authenticated-orcid":false,"given":"Feifan","family":"Liu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1265-7141","authenticated-orcid":false,"given":"Yuchong","family":"Hu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2432-0092","authenticated-orcid":false,"given":"Zhengyu","family":"Liu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7321-9264","authenticated-orcid":false,"given":"Xinjue","family":"Zheng","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6391-7798","authenticated-orcid":false,"given":"Wenxiang","family":"Zhou","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","first-page":"265","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, and Michael Isard. 2016. TensorFlow: a system for Large-Scale machine learning. In 12th USENIX symposium on operating systems design and implementation (OSDI 16). USENIX Association, 265\u2013283."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-47448-4_5"},{"key":"e_1_3_3_2_4_2","unstructured":"Alham\u00a0Fikri Aji and Kenneth Heafield. 2017. Sparse communication for distributed gradient descent. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1704.05021 (2017)."},{"key":"e_1_3_3_2_5_2","unstructured":"Dan Alistarh Demjan Grubic Jerry Li Ryota Tomioka and Milan Vojnovic. 2017. QSGD: Communication-efficient SGD via gradient quantization and encoding. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_6_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry and Amanda Askell. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS57875.2023.00015"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD58817.2023.00031"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/Cluster48925.2021.00019"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_2_11_2","unstructured":"Tim Dettmers. 2015. 8-bit approximations for parallelism in deep learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1511.04561 (2015)."},{"key":"e_1_3_3_2_12_2","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.04805 (2018)."},{"key":"e_1_3_3_2_13_2","first-page":"929","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran\u00a0Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: A checkpointing system for training deep learning recommendation models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, 929\u2013943."},{"key":"e_1_3_3_2_14_2","unstructured":"Fartash Faghri Iman Tabrizian Ilia Markov Dan Alistarh Daniel\u00a0M Roy and Ali Ramezani-Kebrya. 2020. Adaptive gradient quantization for data-parallel sgd. Advances in neural information processing systems 33 (2020) 3174\u20133185."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Jiarui Fang Haohuan Fu Guangwen Yang and Cho-Jui Hsieh. 2019. RedSync: reducing synchronization bandwidth for distributed deep learning training system. J. Parallel and Distrib. Comput. 133 (2019) 30\u201339.","DOI":"10.1016\/j.jpdc.2019.05.016"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650085"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_19_2","first-page":"439","volume-title":"Proceedings of the Ninth Asian Conference on Machine Learning","volume":"77","author":"Hermans Joeri\u00a0R.","year":"2017","unstructured":"Joeri\u00a0R. Hermans, Gerasimos Spanakis, and Rico M\u00f6ckel. 2017. Accumulated Gradient Normalization. In Proceedings of the Ninth Asian Conference on Machine Learning , Vol.\u00a077. PMLR, 439\u2013454."},{"key":"e_1_3_3_2_20_2","first-page":"947","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU clusters for DNN training workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, 947\u2013960."},{"key":"e_1_3_3_2_21_2","volume-title":"ICLR (Poster)","author":"Kingma Diederik\u00a0P.","year":"2015","unstructured":"Diederik\u00a0P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization.. In ICLR (Poster). OpenReview.net."},{"key":"e_1_3_3_2_22_2","unstructured":"Alex Krizhevsky and Geoffrey Hinton. 2009. Learning multiple layers of features from tiny images. University of Toronto (2009)."},{"key":"e_1_3_3_2_23_2","unstructured":"Mu Li David\u00a0G Andersen Alexander\u00a0J Smola and Kai Yu. 2014. Communication efficient distributed machine learning with the parameter server. Advances in Neural Information Processing Systems 27 (2014)."},{"key":"e_1_3_3_2_24_2","volume-title":"Big learning NIPS workshop","author":"Li Mu","year":"2013","unstructured":"Mu Li, Li Zhou, Zichao Yang, Aaron Li, Fei Xia, David\u00a0G Andersen, and Alexander Smola. 2013. Parameter server for distributed machine learning. In Big learning NIPS workshop , Vol.\u00a06. Lake Tahoe, CA, Curran Associates."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508399"},{"key":"e_1_3_3_2_26_2","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan and Pritam Damania. 2020. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.15704 (2020)."},{"key":"e_1_3_3_2_27_2","unstructured":"Yujun Lin Song Han Huizi Mao Yu Wang and William\u00a0J Dally. 2017. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1712.01887 (2017)."},{"key":"e_1_3_3_2_28_2","unstructured":"Ahmed M\u00a0Abdelmoniem Ahmed Elzanaty Mohamed-Slim Alouini and Marco Canini. 2021. An efficient statistical-based gradient compression technique for distributed training systems. Proceedings of Machine Learning and Systems 3 (2021) 297\u2013322."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3625549.3658685"},{"key":"e_1_3_3_2_30_2","unstructured":"Stephen Merity Caiming Xiong James Bradbury and Richard Socher. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.07843 (2016)."},{"key":"e_1_3_3_2_31_2","unstructured":"Meta. 2022. OPT-175B logbook. https:\/\/github.com\/facebookresearch\/metaseq\/blob\/main\/projects\/OPT\/chronicles\/OPT175B_Logbook.pdf\/."},{"key":"e_1_3_3_2_32_2","unstructured":"Microsoft. 2019. Deepspeed. https:\/\/github.com\/deepspeedai\/DeepSpeed\/."},{"key":"e_1_3_3_2_33_2","unstructured":"Microsoft. 2019. Deepspeedexample. https:\/\/github.com\/deepspeedai\/DeepSpeedExamples\/."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3625549.3658678"},{"key":"e_1_3_3_2_35_2","first-page":"203","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent,Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 203\u2013216."},{"key":"e_1_3_3_2_36_2","unstructured":"OpenAI. 2022. ChatGPT. https:\/\/openai.com\/index\/chatgpt\/."},{"key":"e_1_3_3_2_37_2","unstructured":"OpenAI. 2024. GPT-4 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2303.08774\u00a0[cs.CL]"},{"key":"e_1_3_3_2_38_2","unstructured":"Adam Paszke Sam Gross Francisco Massa Adam Lerer James Bradbury Gregory Chanan Trevor Killeen Zeming Lin Natalia Gimelshein and Luca Antiga. 2019. Pytorch: An imperative style high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_2_39_2","unstructured":"Pytorch. 2016. https:\/\/github.com\/pytorch\/pytorch."},{"key":"e_1_3_3_2_40_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_3_2_42_2","unstructured":"Pranav Rajpurkar Robin Jia and Percy Liang. 2018. Know What You Don\u2019t Know: Unanswerable Questions for SQuAD. CoRR abs\/1806.03822 (2018). arXiv:https:\/\/arXiv.org\/abs\/1806.03822http:\/\/arxiv.org\/abs\/1806.03822"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356222"},{"key":"e_1_3_3_2_45_2","unstructured":"Alexander Sergeev and Mike\u00a0Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arxiv:https:\/\/arXiv.org\/abs\/1802.05799\u00a0[cs.LG]"},{"key":"e_1_3_3_2_46_2","unstructured":"Shaohuai Shi Zhenheng Tang Qiang Wang Kaiyong Zhao and Xiaowen Chu. 2019. Layer-wise adaptive gradient sparsification for distributed deep learning with convergence guarantees. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1911.08727 (2019)."},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"e_1_3_3_2_48_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1409.1556 (2014)."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575712"},{"key":"e_1_3_3_2_50_2","unstructured":"Sebastian\u00a0U Stich Jean-Baptiste Cordonnier and Martin Jaggi. 2018. Sparsified SGD with memory. Advances in neural information processing systems 31 (2018)."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707255"},{"key":"e_1_3_3_2_52_2","first-page":"559","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Wan Borui","year":"2025","unstructured":"Borui Wan, Mingji Han, Yiyao Sheng, Yanghua Peng, Haibin Lin, Mofan Zhang, Zhichao Lai, Menghan Yu, Junda Zhang, Zuquan Song, et\u00a0al. 2025. ByteCheckpoint: A Unified Checkpointing System for Large Foundation Model Development. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). USENIX Association, 559\u2013578."},{"key":"e_1_3_3_2_53_2","unstructured":"Guanhua Wang Olatunji Ruwase Bing Xie and Yuxiong He. 2024. FastPersist: Accelerating Model Checkpointing in Deep Learning. arxiv:https:\/\/arXiv.org\/abs\/2406.13768\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2406.13768"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Ru Zhang Wencong Xiao Hongyu Zhang Yu Liu Haoxiang Lin and Mao Yang. 2020. An empirical study on program failures of deep learning jobs. In 2020 IEEE\/ACM 42nd International Conference on Software Engineering (ICSE). IEEE 1159-1170 (2020).","DOI":"10.1145\/3377811.3380362"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Pritam Damania Bernard Nguyen Geeta Chauhan Yuchen Hao Ajit Mathews and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. Proceedings of the VLDB Endowment 16 (09 2023) 3848\u20133860. 10.14778\/3611540.3611569","DOI":"10.14778\/3611540.3611569"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759891","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:42:01Z","timestamp":1773254521000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759891"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":55,"alternative-id":["10.1145\/3712285.3759891","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759891","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}