{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:22:29Z","timestamp":1763922149316,"version":"3.45.0"},"publisher-location":"Cham","reference-count":20,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032076113","type":"print"},{"value":"9783032076120","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07612-0_13","type":"book-chapter","created":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T17:57:14Z","timestamp":1763920634000},"page":"162-171","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluation of\u00a0Distributed Asynchronous Checkpointing in\u00a0High-Performance Computing"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8872-6804","authenticated-orcid":false,"given":"Riccardo","family":"Scheda","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8157-1459","authenticated-orcid":false,"given":"Domitilla","family":"Brandoni","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1856-4452","authenticated-orcid":false,"given":"Laura","family":"Cavalli","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0753-2571","authenticated-orcid":false,"given":"Laura","family":"Morselli","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,24]]},"reference":[{"key":"13_CR1","unstructured":"Optimizing checkpointing efficiency with pytorch dcp. https:\/\/discuss.pytorch.org\/t\/distributed-w-torchtitan-optimizing-checkpointing-efficiency-with-pytorch-dcp\/211250, Accessed 24 Feb 2025"},{"key":"13_CR2","unstructured":"Reducing model checkpointing times by over 10x with pytorch distributed asynchronous checkpointing. https:\/\/pytorch.org\/blog\/reducing-checkpointing-times, Accessed 24 Feb 2025"},{"key":"13_CR3","doi-asserted-by":"publisher","unstructured":"Cineca, supercomputing centre, supercomputing applications and innovation department, leonardo: a pan-european pre-exascale supercomputer for hpc and ai applications. Journal of Large-Scale Research Facilities 8, A186 (2024). https:\/\/doi.org\/10.17815\/jlsrf-8-186","DOI":"10.17815\/jlsrf-8-186"},{"key":"13_CR4","doi-asserted-by":"publisher","unstructured":"Chen, M., et al.: Efficientqat: efficient quantization-aware training for large language models. ArXiv abs\/2407.11062 (2024).https:\/\/doi.org\/10.48550\/arXiv.2407.11062","DOI":"10.48550\/arXiv.2407.11062"},{"key":"13_CR5","unstructured":"Chowdhery, A., et al.: Palm: scaling language modeling with pathways (2022). https:\/\/arxiv.org\/abs\/2204.02311"},{"key":"13_CR6","doi-asserted-by":"publisher","unstructured":"Dash, S., et al.: Optimizing distributed training on frontier for large language models, pp. 1\u201311 (2023). https:\/\/doi.org\/10.48550\/arXiv.2312.12705","DOI":"10.48550\/arXiv.2312.12705"},{"key":"13_CR7","unstructured":"Dubey, A., et\u00a0al.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"key":"13_CR8","unstructured":"Fedus, W., Zoph, B., Shazeer, N.: Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity (2022). https:\/\/arxiv.org\/abs\/2101.03961"},{"key":"13_CR9","unstructured":"Gu, A., Dao, T.: Mamba: linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:2312.00752 (2023)"},{"key":"13_CR10","unstructured":"Gugger, S., et al.: Accelerate: training and inference at scale made simple, efficient and adaptable. https:\/\/github.com\/huggingface\/accelerate (2022)"},{"key":"13_CR11","doi-asserted-by":"publisher","unstructured":"Hoffmann, J., et al.: Training compute-optimal large language models. ArXiv abs\/2203.15556 (2022). https:\/\/doi.org\/10.48550\/arXiv.2203.15556","DOI":"10.48550\/arXiv.2203.15556"},{"key":"13_CR12","unstructured":"Liang, W., et al.: Torchtitan: one-stop pytorch native solution for production ready LLM pre-training (2024). https:\/\/arxiv.org\/abs\/2410.06511"},{"key":"13_CR13","doi-asserted-by":"publisher","unstructured":"Maurya, A., Underwood, R., Rafique, M.M., Cappello, F., Nicolae, B.: Datastates-LLM: Lazy asynchronous checkpointing for large language models. In: Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing, pp. 227\u2013239. HPDC \u201924, ACM (2024). https:\/\/doi.org\/10.1145\/3625549.3658685","DOI":"10.1145\/3625549.3658685"},{"key":"13_CR14","doi-asserted-by":"publisher","unstructured":"Mei, Z., Fu, W., Li, K., Wang, G., Zhang, H., Wu, Y.: Realhf: optimized RLHF training for large language models through parameter reallocation. ArXiv (2024). https:\/\/doi.org\/10.48550\/arXiv.2406.14088","DOI":"10.48550\/arXiv.2406.14088"},{"key":"13_CR15","unstructured":"Mohan, J., Phanishayee, A., Chidambaram, V.: $$\\{$$CheckFreq$$\\}$$: Frequent,$$\\{$$Fine-Grained$$\\}$$$$\\{$$DNN$$\\}$$ checkpointing. In: 19th USENIX Conference on File and Storage Technologies (FAST 21), pp. 203\u2013216 (2021)"},{"key":"13_CR16","doi-asserted-by":"crossref","unstructured":"Pang, B., Lee, L.: Seeing stars: exploiting class relationships for sentiment categorization with respect to rating scales. In: Proceedings of the ACL (2005)","DOI":"10.3115\/1219840.1219855"},{"key":"13_CR17","first-page":"30811","volume":"37","author":"G Penedo","year":"2024","unstructured":"Penedo, G., et al.: The fineweb datasets: decanting the web for the finest text data at scale. Adv. Neural. Inf. Process. Syst. 37, 30811\u201330849 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"13_CR18","unstructured":"Rojas, E., Kahira, A.N., Meneses, E., Gomez, L.B., Badia, R.M.: A study of checkpointing in large scale training of deep neural networks (2021). https:\/\/arxiv.org\/abs\/2012.00825"},{"key":"13_CR19","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models (2023). https:\/\/arxiv.org\/abs\/2307.09288"},{"key":"13_CR20","unstructured":"Zhao, Y., et al.: Pytorch fsdp: experiences on scaling fully sharded data parallel (2023). https:\/\/arxiv.org\/abs\/2304.11277"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07612-0_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T17:57:17Z","timestamp":1763920637000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07612-0_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,24]]},"ISBN":["9783032076113","9783032076120"],"references-count":20,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07612-0_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,24]]},"assertion":[{"value":"24 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISC High Performance","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on High Performance Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hamburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 June 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 June 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"40","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"supercomputing2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}