{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T19:20:23Z","timestamp":1773429623341,"version":"3.50.1"},"reference-count":25,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"Institute of Information Communications Technology Planning Evaluation (IITP) grants"},{"name":"Korean Government, Ministry of Science and ICT","award":["2022-0-00498"],"award-info":[{"award-number":["2022-0-00498"]}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea (NRF) Grant","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Korean Government, MSIT","award":["RS-2024-00416666"],"award-info":[{"award-number":["RS-2024-00416666"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/access.2024.3446770","type":"journal-article","created":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T15:46:37Z","timestamp":1724168797000},"page":"116891-116904","source":"Crossref","is-referenced-by-count":4,"title":["Optimizing Multi-Level Checkpointing for Distributed Deep Learning Workloads on Cloud Spot VM Clusters"],"prefix":"10.1109","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1000-0162","authenticated-orcid":false,"given":"Yonghyeon","family":"Cho","sequence":"first","affiliation":[{"name":"webOS SW Development Group, LG Electronics, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7902-7459","authenticated-orcid":false,"given":"Yoochan","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Sogang University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8424-002X","authenticated-orcid":false,"given":"Kihyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Sogang University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6244-9865","authenticated-orcid":false,"given":"Jinwoo","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Sogang University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0452-6336","authenticated-orcid":false,"given":"Hong-Yeon","family":"Kim","sequence":"additional","affiliation":[{"name":"Electronics and Telecommunications Research Institute (ETRI), Daejeon, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8786-3850","authenticated-orcid":false,"given":"Youngjae","family":"Kim","sequence":"additional","affiliation":[{"name":"Electronics and Telecommunications Research Institute (ETRI), Daejeon, South Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/SCWS55283.2021.00018"},{"key":"ref2","first-page":"929","article-title":"Check-n-run: A checkpointing system for training deep learning recommendation models","volume-title":"Proc. 19th USENIX Symp. Networked Syst. Design Implement.","author":"Eisenman"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"ref4","article-title":"Horovod: Fast and easy distributed deep learning in TensorFlow","author":"Sergeev","year":"2018","journal-title":"arXiv:1802.05799"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref6","article-title":"Torchgpipe: On-the-fly pipeline parallelism for training giant models","author":"Kim","year":"2020","journal-title":"arXiv:2004.09910"},{"key":"ref7","first-page":"1","article-title":"Analysis and exploitation of dynamic pricing in the public cloud for ml training","volume-title":"Proc. Workshop Distrib. Infrastruct., Syst., Program., AI","author":"Narayanan"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2017.09.038"},{"key":"ref9","first-page":"203","article-title":"CheckFreq: Frequent, fine-grained DNN checkpointing","volume-title":"Proc. 19th USENIX Conf. File Storage Technol.","author":"Mohan"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER51413.2022.00044"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3003307"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.18"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW50202.2020.00174"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid49817.2020.00-76"},{"key":"ref15","volume-title":"Spot Instance Interruptions","year":"2023"},{"key":"ref16","volume-title":"Prepare for Interruptions","year":"2023"},{"key":"ref17","volume-title":"Spot Instance Interruption Notices","year":"2023"},{"key":"ref18","volume-title":"Flexible I\/O Tester","author":"Axboe","year":"2023"},{"key":"ref19","volume-title":"Distributed Data Parallel","year":"2023"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1177\/1094342013488238"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063429"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00099"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3009184"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/j.cosrev.2021.100398"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.3390\/app13158785"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10380310\/10639967.pdf?arnumber=10639967","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T04:27:32Z","timestamp":1725164852000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10639967\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":25,"URL":"https:\/\/doi.org\/10.1109\/access.2024.3446770","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}