{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T18:12:22Z","timestamp":1759947142310,"version":"build-2065373602"},"reference-count":46,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T00:00:00Z","timestamp":1756771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T00:00:00Z","timestamp":1756771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62025208,62421002"],"award-info":[{"award-number":["62025208,62421002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,9,2]]},"DOI":"10.1109\/cluster59342.2025.11186488","type":"proceedings-article","created":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T17:35:09Z","timestamp":1759858509000},"page":"1-12","source":"Crossref","is-referenced-by-count":0,"title":["Capricorn: Efficient In-Memory Checkpointing for MoE Model Training with Dynamicity Awareness"],"prefix":"10.1109","author":[{"given":"Wenqian","family":"Xie","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"}]},{"given":"Zhiquan","family":"Lai","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed 
Computing,Changsha,China"}]},{"given":"Shengwei","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"}]},{"given":"Weijie","family":"Liu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"}]},{"given":"Wei","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"}]},{"given":"Yanqi","family":"Hao","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"}]},{"given":"Dongsheng","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology,National Key Laboratory of Parallel and Distributed Computing,Changsha,China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/vl\/N19-142"},{"key":"ref3","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref4","first-page":"269","article-title":"Tutel: Adaptive mixture-of-experts at scale","volume-title":"Proceedings of Machine Learning and 
Systems","volume":"5","author":"Hwang","year":"2023"},{"key":"ref5","first-page":"18332","article-title":"Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale","volume-title":"International Conference on Machine Learning. PMLR","author":"Rajbhandari","year":"2022"},{"issue":"1","key":"ref6","first-page":"5232","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"The Journal of Machine Learning Research"},{"key":"ref7","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604869"},{"key":"ref10","first-page":"74","article-title":"Lancet: Accelerating mixture-of-experts training via whole graph computation-communication overlapping","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Jiang","year":"2024"},{"key":"ref11","first-page":"945","article-title":"Accelerating distributed moe training and inference with lina","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li","year":"2023"},{"volume-title":"Qwen3","year":"2025","author":"Cloud","key":"ref12"},{"key":"ref13","article-title":"Deepseek-v3 technical report","author":"Liu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref14","first-page":"745","article-title":"Megascale: Scaling large language model training to more than 10,000 gpus","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang","year":"2024"},{"key":"ref15","first-page":"709","article-title":"Characterization of large language model development in the datacenter","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 
24)","author":"Hu","year":"2024"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref17","first-page":"203","article-title":"Checkfreq: Frequent, fine-grained dnn checkpointing","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan","year":"2021"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3625549.3658685"},{"key":"ref19","first-page":"arXiv","article-title":"Bytecheckpoint: A unified checkpointing system for llm development","author":"Wan","year":"2024","journal-title":"arXiv e-prints"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"issue":"1","key":"ref21","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"The Journal of Machine Learning Research"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1907.11692"},{"key":"ref23","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020","journal-title":"arXiv preprint"},{"key":"ref24","article-title":"Gshard: Scaling giant models with conditional computation and automatic sharding","volume-title":"International Conference on Learning Representations","author":"Lepikhin","year":"2021"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650085"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567505"},{"key":"ref28","first-page":"23274","article-title":"Dragonn: Distributed randomized approximate gradients of neural networks","volume-title":"International Conference on Machine Learning. 
PMLR","author":"Wang","year":"2022"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3405671.3405810"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-57077-4_10"},{"key":"ref31","article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019","journal-title":"arXiv preprint"},{"volume-title":"Openwebtext corpus","year":"2019","author":"Gokaslan","key":"ref32"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"article-title":"Welcome to the torchsnapshot documentation","volume-title":"PyTorch","year":"2024","key":"ref34"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"ref36","first-page":"172","article-title":"DeepFreeze: Towards Scalable Asynchronous Checkpointing of Deep Learning Models","volume-title":"2020 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)","author":"Nicolae"},{"volume-title":"FastPersist: Accelerating Model Checkpointing in Deep Learning","year":"2024","author":"Wang","key":"ref37"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707255"},{"key":"ref39","first-page":"150","article-title":"A Cost-Efficient Failure-Tolerant Scheme for Distributed DNN Training","volume-title":"2023 IEEE 41st International Conference on Computer Design (ICCD)","author":"Chen"},{"key":"ref40","article-title":"Reliable and efficient in-memory fault tolerance of large language model pretraining","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650085"},{"key":"ref42","first-page":"382","article-title":"Oobleck: Resilient Distributed Training of Large Models Using Pipeline Templates","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Jang"},{"journal-title":"Parcae: Proactive, Liveput-Optimized DNN 
Training on Preemptible Instances.","author":"Duan","key":"ref43"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695960"},{"journal-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs.","author":"Thorpe","key":"ref46"}],"event":{"name":"2025 IEEE International Conference on Cluster Computing (CLUSTER)","start":{"date-parts":[[2025,9,2]]},"location":"United Kingdom","end":{"date-parts":[[2025,9,5]]}},"container-title":["2025 IEEE International Conference on Cluster Computing (CLUSTER)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11186399\/11186452\/11186488.pdf?arnumber=11186488","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T17:36:46Z","timestamp":1759945006000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11186488\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,2]]},"references-count":46,"URL":"https:\/\/doi.org\/10.1109\/cluster59342.2025.11186488","relation":{},"subject":[],"published":{"date-parts":[[2025,9,2]]}}}