{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T21:11:09Z","timestamp":1771535469392,"version":"3.50.1"},"reference-count":29,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,11,11]],"date-time":"2024-11-11T00:00:00Z","timestamp":1731283200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,11]],"date-time":"2024-11-11T00:00:00Z","timestamp":1731283200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Front. Comput. Sci."],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1007\/s11704-023-3401-5","type":"journal-article","created":{"date-parts":[[2024,11,11]],"date-time":"2024-11-11T09:19:10Z","timestamp":1731316750000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["BAFT: bubble-aware fault-tolerant framework for distributed DNN training with hybrid parallelism"],"prefix":"10.1007","volume":"19","author":[{"given":"Runzhe","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guandong","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yakai","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zheng","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanming","family":"Miao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhifang","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingwen","family":"Leng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,11]]},"reference":[{"issue":"6","key":"3401_CR1","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky A, Sutskever I, Hinton G E. ImageNet classification with deep convolutional neural networks. Communications of the ACM, 2017, 60(6): 84\u201390","journal-title":"Communications of the ACM"},{"key":"3401_CR2","first-page":"159","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"T B Brown","year":"2020","unstructured":"Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D. Language models are few-shot learners. In: Proceedings of the 34th International Conference on Neural Information Processing Systems. 2020, 159"},{"key":"3401_CR3","first-page":"1414","volume-title":"Proceedings of the 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"C Guo","year":"2022","unstructured":"Guo C, Zhang C, Leng J, Liu Z, Yang F, Liu Y, Guo M, Zhu Y. Ant: exploiting adaptive numerical data type for low-bit deep neural network quantization. In: Proceedings of the 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO). 2022, 1414\u20131433"},{"key":"3401_CR4","first-page":"1083","volume-title":"Proceedings of the 48th ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA)","author":"Y Wang","year":"2021","unstructured":"Wang Y, Zhang C, Xie Z, Guo C, Liu Y, Leng J. Dual-side sparse tensor core. In: Proceedings of the 48th ACM\/IEEE Annual International Symposium on Computer Architecture (ISCA). 2021, 1083\u20131095"},{"key":"3401_CR5","first-page":"1","volume-title":"Procedings of SC20: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"C Guo","year":"2020","unstructured":"Guo C, Hsueh B Y, Leng J, Qiu Y, Guan Y, Wang Z, Jia X, Li X, Guo M, Zhu Y A. Accelerating sparse DNN models without hardwaresupport via tile-wise sparsity. In: Procedings of SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. 2020, 1\u201315"},{"key":"3401_CR6","first-page":"738","volume-title":"Proceeding of the 40th IEEE International Conference on Computer Design (ICCD)","author":"C Guo","year":"2022","unstructured":"Guo C, Qiu Y, Leng J, Zhang C, Cao Y, Zhang Q, Liu Y, Yang F, Guo M. Nesting forward automatic differentiation for memory-efficient deep neural network training. In: Proceeding of the 40th IEEE International Conference on Computer Design (ICCD). 2022, 738\u2013745"},{"key":"3401_CR7","first-page":"1","volume-title":"Proceedings of 2020 IEEE Hot Chips 32 Symposium (HCS)","author":"J Choquette","year":"2020","unstructured":"Choquette J, Gandhi W. NVIDIA A100 GPU: performance & innovation for GPU computing. In: Proceedings of 2020 IEEE Hot Chips 32 Symposium (HCS). 2020, 1\u201343"},{"key":"3401_CR8","first-page":"388","volume-title":"Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS\u2019 22)","author":"Z Liu","year":"2022","unstructured":"Liu Z, Leng J, Zhang Z, Chen Q, Li C, and Guo M. VELTAIR: towards high-performance multi-tenant deep learning services via adaptive compilation and scheduling. In: Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS\u2019 22). 2022, 388\u2013401"},{"key":"3401_CR9","first-page":"947","volume-title":"Proceedings of 2019 USENIX Conference on USENIX Annual Technical Conference","author":"M Jeon","year":"2019","unstructured":"Jeon M, Venkataraman S, Phanishayee A, Qian U, Xiao W, Yang F. Analysis of large-scale multi-tenant GPU clusters for DNN training workloads. In: Proceedings of 2019 USENIX Conference on USENIX Annual Technical Conference. 2019, 947\u2013960"},{"key":"3401_CR10","first-page":"129","volume-title":"Proceedings of the 25th IEEE International Symposium on On-Line Testing and Robust System Design (IOLTS)","author":"D Gizopoulos","year":"2019","unstructured":"Gizopoulos D, Papadimitriou G, Chatzidimitriou A, Reddi V J, Salami B, Unsal O S, Kestelman A C, Leng J. Modern hardware margins: Cpus, gpus, fpgas recent system-level studies. In: Proceedings of the 25th IEEE International Symposium on On-Line Testing and Robust System Design (IOLTS). 2019, 129\u2013134"},{"issue":"2","key":"3401_CR11","doi-asserted-by":"publisher","first-page":"341","DOI":"10.1109\/TDMR.2020.2989813","volume":"20","author":"G Papadimitriou","year":"2020","unstructured":"Papadimitriou G, Chatzidimitriou A, Gizopoulos D, Reddi V J, Leng J, Salami B, Unsal O S, Kestelman A C. Exceeding conservative limits: a consolidated analysis on modern hardware margins. IEEE Transactions on Device and Materials Reliability, 2020, 20(2): 341\u2013350","journal-title":"IEEE Transactions on Device and Materials Reliability"},{"key":"3401_CR12","first-page":"4777","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Qiu","year":"2019","unstructured":"Qiu Y, Leng J, Guo C, Chen Q, Li C, Guo M, Zhu Y. Adversarial defense through network profiling based path extraction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2019, 4777\u20134786"},{"key":"3401_CR13","first-page":"44","volume-title":"Proceedings of 2020 IEEE International Symposium on High Performance Com puter Architecture (HPCA)","author":"J Leng","year":"2020","unstructured":"Leng J, Buyuktosunoglu A, Bertran R, Bose P, Chen Q, Guo M, Janapa Reddi V. Asymmetric resilience: exploiting task-level idempotency for transient error recovery in accelerator-based systems. In: Proceedings of 2020 IEEE International Symposium on High Performance Com puter Architecture (HPCA). 2020, 44\u201357"},{"key":"3401_CR14","first-page":"203","volume-title":"Proceedings of the 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"J Mohan","year":"2021","unstructured":"Mohan J, Phanishayee A, Chidambaram V. CheckFreq: Frequent, fine-grained DNN checkpointing. In: Proceedings of the 19th USENIX Conference on File and Storage Technologies (FAST 21). 2021, 203\u2013216"},{"key":"3401_CR15","first-page":"929","volume-title":"Proceedings of the 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"A Eisenman","year":"2022","unstructured":"Eisenman A, Matam K K, Ingram S, Mudigere D, Krishnamoorthi R, Nair K, Smelyanskiy M, Annavaram M. Check-N-Run: a checkpointing system for training deep learning recommendation models. In: Proceedings of the 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 2022, 929\u2013943"},{"key":"3401_CR16","first-page":"172","volume-title":"Proceedings of the 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)","author":"B Nicolae","year":"2020","unstructured":"Nicolae B, Li J, Wozniak J M, Bosilca G, Dorier M, Cappello F. DeepFreeze: towards scalable asynchronous checkpointing of deep learning models. In: Proceedings of the 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID). 2020, 172\u2013181"},{"issue":"12","key":"3401_CR17","doi-asserted-by":"publisher","first-page":"3005","DOI":"10.14778\/3415478.3415530","volume":"13","author":"S Li","year":"2020","unstructured":"Li S, Zhao Y, Varma R, Salpekar O, Noordhuis P, Li T, Paszke A, Smith J, Vaughan B, Damania P, Chintala S. PyTorch distributed: experiences on accelerating data parallel training. Proceedings of the VLDB Endowment, 2020, 13(12): 3005\u20133018","journal-title":"Proceedings of the VLDB Endowment"},{"key":"3401_CR18","unstructured":"Zeng W, Ren X, Su T, Wang H, Liao Y, Wang Z, Jiang X, Yang Z, Wang K, Zhang X, Li C, Gong Z, Yao Y, Huang X, Wang J, Yu J, Guo Q, Yu Y, Zhang Y, Wang J, Tao H, Yan D, Yi Z, Peng F, Jiang F, Zhang H, Deng L, Zhang Y, Lin Z, Zhang C, Zhang S, Guo M, Gu S, Fan G, Wang Y, Jin X, Liu Q, Tian Y. PanGu-a: large-scale autoregressive pretrained Chinese language models with auto-parallel computation. 2021, arXiv preprint arXiv: 2104.12369"},{"key":"3401_CR19","first-page":"58","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"D Narayanan","year":"2021","unstructured":"Narayanan D, Shoeybi M, Casper J, LeGresley P, Patwary M, Korthikanti V, Vainbrand D, Kashinkunti P, Bernauer J, Catanzaro B, Phanishayee A, Zaharia M. Efficient large-scale language model training on GPU clusters using megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 2021, 58"},{"key":"3401_CR20","first-page":"559","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation","author":"L Zheng","year":"2022","unstructured":"Zheng L, Li Z, Zhang H, Zhuang Y, Chen Z, Huang Y, Wang Y, Xu Y, Zhuo D, Xing E P, Gonzalez J, Stoica I. Alpa: automating inter- and intra-operator parallelism for distributed deep learning. In: Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation. 2022, 559\u2013578"},{"key":"3401_CR21","unstructured":"Xu Y, Lee H, Chen D, Hechtman B A, Huang Y, Joshi R, Krikun M, Lepikhin D, Ly A, Maggioni M, Pang R, Shazeer N, Wang S, Wang T, Wu Y, Chen Z. GSPMD: general and scalable parallelization for ML computation graphs. 2021, arXiv preprint arXiv: 2105.04663"},{"key":"3401_CR22","first-page":"1","volume-title":"Proceedings of SC21: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"S Li","year":"2021","unstructured":"Li S, Hoefler T. Chimera: efficiently training large-scale neural networks with bidirectional pipelines. In: Proceedings of SC21: International Conference for High Performance Computing, Networking, Storage and Analysis. 2021, 1\u201314"},{"key":"3401_CR23","volume-title":"Improving language understanding by generative pre-training","author":"A Radford","year":"2018","unstructured":"Radford A, Narasimhan K. Improving language understanding by generative pre-training. 2018"},{"key":"3401_CR24","first-page":"6000","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A N, Kaiser L, Polosukhin I. Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems. 2017, 6000\u20136010"},{"key":"3401_CR25","volume-title":"Proceedings of the 5th International Conference on Learning Representations","author":"S Merity","year":"2017","unstructured":"Merity S, Xiong C, Bradbury J, Socher R. Pointer sentinel mixture models. In: Proceedings of the 5th International Conference on Learning Representations. 2017"},{"key":"3401_CR26","unstructured":"Chen Y, Yang Q, He S, Shi Z, Chen J. FTPipeHD: a fault-tolerant pipeline-parallel distributed training framework for heterogeneous edge devices. 2021, arXiv preprint arXiv: 2110.02781"},{"key":"3401_CR27","first-page":"911","volume-title":"Proceedings of 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","author":"B Nicolae","year":"2019","unstructured":"Nicolae B, Moody A, Gonsiorowski E, Mohror K, Cappello F. VeloC: towards high performance adaptive asynchronous checkpointing at large scale. In: Proceedings of 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS). 2019, 911\u2013920"},{"key":"3401_CR28","first-page":"3660","volume-title":"Proceedings of 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"P Li","year":"2021","unstructured":"Li P, Koyuncu E, Seferoglu H. Respipe: resilient model-distributed DNN training at edge networks. In: Proceedings of 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 2021, 3660\u20133664"},{"key":"3401_CR29","unstructured":"Maeng K, Bharuka S, Gao I, Jeffrey M C, Saraph V, Su B Y, Trippel C, Yang J, Rabbat M, Lucia B, Wu C J. CPR: understanding and improving failure tolerant training for deep learning recommendation with partial recovery. 2020, arXiv preprint arXiv: 2011.02999"}],"container-title":["Frontiers of Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-023-3401-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11704-023-3401-5","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-023-3401-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T20:27:42Z","timestamp":1771532862000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11704-023-3401-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,11]]},"references-count":29,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,1]]}},"alternative-id":["3401"],"URL":"https:\/\/doi.org\/10.1007\/s11704-023-3401-5","relation":{},"ISSN":["2095-2228","2095-2236"],"issn-type":[{"value":"2095-2228","type":"print"},{"value":"2095-2236","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,11]]},"assertion":[{"value":"12 May 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 September 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Competing interests\n                      The authors declare that they have no competing interests or financial conflicts to disclose.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics"}}],"article-number":"191102"}}