{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,14]],"date-time":"2024-12-14T05:15:54Z","timestamp":1734153354262,"version":"3.30.2"},"reference-count":34,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T00:00:00Z","timestamp":1731110400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T00:00:00Z","timestamp":1731110400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key R&D Program of China","doi-asserted-by":"publisher","award":["2022YFB4500405"],"award-info":[{"award-number":["2022YFB4500405"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U22A6001"],"award-info":[{"award-number":["U22A6001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,9]]},"DOI":"10.1109\/nas63802.2024.10781359","type":"proceedings-article","created":{"date-parts":[[2024,12,12]],"date-time":"2024-12-12T19:07:24Z","timestamp":1734030444000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["TranLogs: Lossless Failure Recovery Empowered by Training Logs"],"prefix":"10.1109","author":[{"given":"Xiaoyu","family":"Liu","sequence":"first","affiliation":[{"name":"Zhejiang Lab,HangZhou,China"}]},{"given":"Lingfang","family":"Zeng","sequence":"additional","affiliation":[{"name":"Zhejiang Lab,HangZhou,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1088\/2632-2153\/abec21"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1088\/0253-6102\/71\/8\/955"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1038\/ncomms5308"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.3847\/1538-4365\/ab8868"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1093\/mnras\/stz2845"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.advwatres.2020.103619"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1021\/acs.est.1c01339"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.mlwa.2021.100134"},{"key":"ref9","article-title":"Natural language processing advancements by deep learning: A survey[J]","author":"Torfi","year":"2023","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1002\/itl2.187"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2019.07.010"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.3390\/app10082749"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2976320"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1115\/1.4047855"},{"key":"ref15","article-title":"Microsoft Strung Together Tens of Thousands of Chips in a Pricey Supercomputer for OpenAI[Online]","volume-title":"Bloomberg","author":"Bass","year":"2023"},{"key":"ref16","first-page":"637","article-title":"Understanding and improving failure tolerant training for deep learning recommendation with partial recovery[J]","volume-title":"Proceedings of Machine Learning and Systems","volume":"3","author":"Maeng"},{"key":"ref17","first-page":"203","article-title":"CheckFreq: Frequent, Fine-GrainedDNN Checkpointing[C]","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan","year":"2021"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS56603.2022.00076"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCCN.2018.8487327"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/cloudnet.2018.8549548"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"ref22","first-page":"929","article-title":"Check-N-Run: A checkpointing system for training deep learning recommendation models[C]","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman","year":"2022"},{"article-title":"Microsoft. Boost Checkpoint Speed and Reduce Cost with Nebula[Online]","volume-title":"Azure Machine Learning","year":"2024","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.14778\/3611479.3611514"},{"key":"ref25","article-title":"FastPersist: Accelerating Model Checkpointing in Deep Learning[J]","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD58817.2023.00031"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid49817.2020.00-76"},{"key":"ref28","article-title":"DynaQuant: Compressing Deep Learning Training Checkpoints via Dynamic Quantization[J]","author":"Agrawal","year":"2023","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/s00354-013-0302-4"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650085"},{"volume-title":"facebookresearch. facebookresearch\/ResNeXt[Online]. github","year":"2021","key":"ref31"},{"volume-title":"NVIDIA. NVIDIA\/DeepLearningExamples[Online]. github","year":"2019","key":"ref32"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-4380-9_41"},{"volume-title":"Alluxio. Architecture[Online]. ALLUXIO","year":"2024","key":"ref34"}],"event":{"name":"2024 International Conference on Networking, Architecture and Storage (NAS)","start":{"date-parts":[[2024,11,9]]},"location":"Zhuhai, China","end":{"date-parts":[[2024,11,11]]}},"container-title":["2024 International Conference on Networking, Architecture and Storage (NAS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10781330\/10781339\/10781359.pdf?arnumber=10781359","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,13]],"date-time":"2024-12-13T06:29:50Z","timestamp":1734071390000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10781359\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,9]]},"references-count":34,"URL":"https:\/\/doi.org\/10.1109\/nas63802.2024.10781359","relation":{},"subject":[],"published":{"date-parts":[[2024,11,9]]}}}