{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T19:15:45Z","timestamp":1758309345994,"version":"3.44.0"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["World Wide Web"],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s11280-025-01367-7","type":"journal-article","created":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T06:59:48Z","timestamp":1754031588000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Research on storage optimization for efficient training of large language models"],"prefix":"10.1007","volume":"28","author":[{"given":"Biyun","family":"Shang","sequence":"first","affiliation":[]},{"given":"Feng","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Mo","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Junning","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Xinyuan","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Zhenjiang","family":"Dong","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,1]]},"reference":[{"key":"1367_CR1","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1367_CR2","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et al.: Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971. (2023)"},{"key":"1367_CR3","unstructured":"Dubey, A., Jauhri, A., Pandey, A., Kadian, A., Al-Dahle, A., Letman, A., Mathur, A., Schelten, A., Yang, A., Fan, A., et al.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783. (2024)"},{"key":"1367_CR4","doi-asserted-by":"crossref","unstructured":"Rajbhandari, S., Rasley, J., Ruwase, O., He, Y.: Zero: Memory optimizations toward training trillion parameter models. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201316 (2020). IEEE","DOI":"10.1109\/SC41405.2020.00024"},{"key":"1367_CR5","doi-asserted-by":"crossref","unstructured":"Narayanan, D., Harlap, A., Phanishayee, A., Seshadri, V., Devanur, N.R., Ganger, G.R., Gibbons, P.B., Zaharia, M.: Pipedream: Generalized pipeline parallelism for dnn training. In: Proceedings of the 27th ACM symposium on operating systems principles, pp. 
1\u201315 (2019)","DOI":"10.1145\/3341301.3359646"},{"key":"1367_CR6","doi-asserted-by":"crossref","unstructured":"Rhu, M., Gimelshein, N., Clemons, J., Zulfiqar, A., Keckler, S.W.: vdnn: Virtualized deep neural networks for scalable, memory-efficient neural network design. In: 2016 49th Annual IEEE\/ACM international symposium on microarchitecture (MICRO), pp. 1\u201313 (2016). IEEE","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"1367_CR7","doi-asserted-by":"crossref","unstructured":"Rajbhandari, S., Ruwase, O., Rasley, J., Smith, S., He, Y.: Zero-infinity: Breaking the gpu memory wall for extreme scale deep learning. In: Proceedings of the international conference for high performance computing, networking, storage and analysis, pp. 1\u201314 (2021)","DOI":"10.1145\/3458817.3476205"},{"key":"1367_CR8","unstructured":"Bae, J., Lee, J., Jin, Y., Son, S., Kim, S., Jang, H., Ham, T.J., Lee, J.W.: $$\\{$$FlashNeuron$$\\}$$:$$\\{$$SSD-Enabled$$\\}$$$$\\{$$Large-Batch$$\\}$$ training of very deep neural networks. In: 19th USENIX conference on file and storage technologies (FAST 21), pp. 387\u2013401 (2021)"},{"key":"1367_CR9","unstructured":"Chen, T., Xu, B., Zhang, C., Guestrin, C.: Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174. (2016)"},{"key":"1367_CR10","unstructured":"Micikevicius, P., Narang, S., Alben, J., Diamos, G., Elsen, E., Garcia, D., Ginsburg, B., Houston, M., Kuchaiev, O., Venkatesh, G., et al.: Mixed precision training. arXiv preprint arXiv:1710.03740. (2017)"},{"key":"1367_CR11","unstructured":"Chen, J., Zheng, L., Yao, Z., Wang, D., Stoica, I., Mahoney, M., Gonzalez, J.: Actnn: Reducing training memory footprint via 2-bit activation compressed training. In: International conference on machine learning, pp. 1803\u20131813 (2021). PMLR"},{"key":"1367_CR12","unstructured":"Zhou, X., He, J., Zhou, W., Chen, H., Tang, Z., Zhao, H., Tong, X., Li, G., Chen, Y., Zhou, J., et al.: A survey of llm x data. arXiv preprint arXiv:2505.18458. (2025)"},{"key":"1367_CR13","doi-asserted-by":"crossref","unstructured":"Chang, E., Li, Y., Huber, P., Kant, D., Shi, Y., Chandra, V.: Automixer: Checkpoint artifacts as automatic data mixers. arXiv preprint arXiv:2506.21910. (2025)","DOI":"10.18653\/v1\/2025.acl-long.979"},{"key":"1367_CR14","unstructured":"Jiang, Z., Lin, H., Zhong, Y., Huang, Q., Chen, Y., Zhang, Z., Peng, Y., Li, X., Xie, C., Nong, S., et al.: $$\\{$$MegaScale$$\\}$$: Scaling large language model training to more than 10,000 $$\\{$$GPUs$$\\}$$. In: 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pp. 745\u2013760 (2024)"},{"key":"1367_CR15","doi-asserted-by":"crossref","unstructured":"Shang, B., Zhang, F., Xu, M., Xu, J., Dong, Z.: Lmstor: Storage acceleration design for large models. In: Asia-Pacific Web (APWeb) and Web-age information management (WAIM) joint international conference on web and big data, pp. 415\u2013426 (2024). Springer","DOI":"10.1007\/978-981-97-7244-5_36"},{"key":"1367_CR16","doi-asserted-by":"crossref","unstructured":"Gupta, T., Krishnan, S., Kumar, R., Vijeev, A., Gulavani, B., Kwatra, N., Ramjee, R., Sivathanu, M.: Just-in-time checkpointing: Low cost error recovery from deep learning training failures. In: Proceedings of the 19th european conference on computer systems, pp. 
1110\u20131125 (2024)","DOI":"10.1145\/3627703.3650085"},{"key":"1367_CR17","unstructured":"Qin, R., Li, Z., He, W., Cui, J., Ren, F., Zhang, M., Wu, Y., Zheng, W., Xu, X.: Mooncake: Trading more storage for less computation\u2014a kvcache-centric architecture for serving llm chatbot. In: 23rd USENIX conference on file and storage technologies (FAST 25), pp. 155\u2013170 (2025). USENIX Association"},{"key":"1367_CR18","doi-asserted-by":"crossref","unstructured":"Maurya, A., Underwood, R., Rafique, M.M., Cappello, F., Nicolae, B.: Datastates-llm: Lazy asynchronous checkpointing for large language models. In: Proceedings of the 33rd international symposium on high-performance parallel and distributed computing, pp. 227\u2013239 (2024)","DOI":"10.1145\/3625549.3658685"},{"key":"1367_CR19","unstructured":"Hu, Q., Ye, Z., Wang, Z., Wang, G., Zhang, M., Chen, Q., Sun, P., Lin, D., Wang, X., Luo, Y., et al.: Characterization of large language model development in the datacenter. In: 21st USENIX symposium on networked systems design and implementation (NSDI 24), pp. 709\u2013729 (2024)"},{"key":"1367_CR20","unstructured":"Team, G., Georgiev, P., Lei, V.I., Burnell, R., Bai, L., Gulati, A., Tanzer, G., Vincent, D., Pan, Z., Wang, S., et al.: Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530. (2024)"},{"key":"1367_CR21","unstructured":"Pace, W., She, C., Xu, L., Jones, W., Lockett, A., Wang, J., Shah, R.: Lance: Efficient random access in columnar storage through adaptive structural encodings. arXiv preprint arXiv:2504.15247. (2025)"},{"key":"1367_CR22","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., Zhou, J.: Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966. 1(2), 3 (2023)"},{"key":"1367_CR23","unstructured":"Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., Catanzaro, B.: Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053. (2019)"},{"key":"1367_CR24","unstructured":"Mohan, J., Phanishayee, A., Chidambaram, V.: CheckFreq: Frequent, Fine-Grained DNN checkpointing. In: 19th USENIX conference on file and storage technologies (FAST 21), pp. 203\u2013216 (2021)"},{"key":"1367_CR25","doi-asserted-by":"crossref","unstructured":"Agrawal, A., Reddy, S., Bhattamishra, S., Nookala, V.P.S., Vashishth, V., Rong, K., Tumanov, A.: Dynaquant: Compressing deep learning training checkpoints via dynamic quantization. arXiv preprint arXiv:2306.11800. (2023)","DOI":"10.1145\/3698038.3698553"},{"key":"1367_CR26","unstructured":"Eisenman, A., Matam, K.K., Ingram, S., Mudigere, D., Krishnamoorthi, R., Nair, K., Smelyanskiy, M., Annavaram, M.: Check-N-Run: A checkpointing system for training deep learning recommendation models. In: 19th USENIX symposium on networked systems design and implementation (NSDI 22), pp. 929\u2013943 (2022)"},{"key":"1367_CR27","unstructured":"Wan, B., Han, M., Sheng, Y., Lai, Z., Zhang, M., Zhang, J., Peng, Y., Lin, H., Liu, X., Wu, C.: Bytecheckpoint: A unified checkpointing system for llm development. arXiv e-prints, 2407 (2024)"},{"key":"1367_CR28","doi-asserted-by":"crossref","unstructured":"Wang, Z., Jia, Z., Zheng, S., Zhang, Z., Fu, X., Ng, T.E., Wang, Y.: Gemini: Fast failure recovery in distributed training with in-memory checkpoints. 
In: Proceedings of the 29th symposium on operating systems principles, pp. 364\u2013381 (2023)","DOI":"10.1145\/3600006.3613145"},{"key":"1367_CR29","unstructured":"Wang, Y., Shi, S., He, X., Tang, Z., Pan, X., Zheng, Y., Wu, X., Zhou, A.C., He, B., Chu, X.: Reliable and efficient in-memory fault tolerance of large language model pretraining. arXiv preprint arXiv:2310.12670. (2023)"},{"key":"1367_CR30","unstructured":"Wang, Q., Sang, B., Zhang, H., Tang, M., Zhang, K.: Dlrover: An elastic deep training extension with auto job resource recommendation. arXiv preprint arXiv:2304.01468. (2023)"},{"key":"1367_CR31","unstructured":"Ren, X., Zhou, P., Meng, X., Huang, X., Wang, Y., Wang, W., Li, P., Zhang, X., Podolskiy, A., Arshinov, G., et al.: Pangu-\u03a3: Towards trillion parameter language model with sparse heterogeneous computing. arXiv preprint arXiv:2303.10845. (2023)"},{"key":"1367_CR32","first-page":"17146","volume":"35","author":"J Xu","year":"2022","unstructured":"Xu, J., Wang, G., Yao, Y., Li, Z., Cao, C., Tong, H., et al.: A deep learning dataloader with shared data preparation. Adv. Neural. Inf. Process. Syst. 35, 17146\u201317156 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1367_CR33","unstructured":"Svogor, I., Eichenberger, C., Spanring, M., Neun, M., Kopp, M.: Profiling and improving the pytorch dataloader for high-latency storage: A technical report. arXiv preprint arXiv:2211.04908. (2022)"},{"key":"1367_CR34","volume-title":"Alluxio: A Virtual Distributed File System","author":"H Li","year":"2018","unstructured":"Li, H.: Alluxio: A Virtual Distributed File System. University of California, Berkeley, Berkeley, CA (2018)"}],"container-title":["World Wide Web"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11280-025-01367-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11280-025-01367-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11280-025-01367-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T15:15:40Z","timestamp":1758294940000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11280-025-01367-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,1]]},"references-count":34,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["1367"],"URL":"https:\/\/doi.org\/10.1007\/s11280-025-01367-7","relation":{},"ISSN":["1386-145X","1573-1413"],"issn-type":[{"type":"print","value":"1386-145X"},{"type":"electronic","value":"1573-1413"}],"subject":[],"published":{"date-parts":[[2025,8,1]]},"assertion":[{"value":"11 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 July 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 July 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 August 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article 
History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}},{"value":"The authors declare no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}}],"article-number":"54"}}