{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T16:05:04Z","timestamp":1781193904758,"version":"3.54.1"},"reference-count":127,"publisher":"Zhejiang University Press","issue":"3","license":[{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Front Inform Technol Electron Eng"],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1631\/fitee.2300710","type":"journal-article","created":{"date-parts":[[2025,3,17]],"date-time":"2025-03-17T00:59:26Z","timestamp":1742173166000},"page":"309-331","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Training large-scale language models with limited GPU memory: a survey","\u6709\u9650GPU\u663e\u5b58\u4e0b\u7684\u5927\u8bed\u8a00\u6a21\u578b\u8bad\u7ec3\u6280\u672f\u7efc\u8ff0"],"prefix":"10.1631","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8595-1547","authenticated-orcid":false,"given":"Yu","family":"Tang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Linbo","family":"Qiao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lujia","family":"Yin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Peng","family":"Liang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ao","family":"Shen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhilin","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lizhi","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9743-2034","authenticated-orcid":false,"given":"Dongsheng","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"635","published-online":{"date-parts":[[2025,3,17]]},"reference":[{"key":"ref1","first-page":"265","article-title":"TensorFlow: a system for large-scale machine learning","volume-title":"Proc 12th USENIX Conf on Operating Systems Design and Implementation","author":"Abadi","year":"2016"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/hpca51647.2021.00072"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2020.113790"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/0925-2312(93)90006-o"},{"key":"ref5","first-page":"1352","article-title":"ReZero is all you need: fast convergence at large depth","volume-title":"Proc 37th Conf on Uncertainty in Artificial Intelligence","author":"Bachlechner","year":"2021"},{"key":"ref6","first-page":"387","article-title":"FlashNeuron: SSD-enabled large-batch training of very deep neural networks","volume-title":"Proc 19th USENIX Conf on File and Storage Technologies","author":"Bae","year":"2021"},{"key":"ref7","first-page":"5151","article-title":"Scalable methods for 8-bit training of neural networks","volume-title":"Proc 32nd Int Conf on Neural Information Processing Systems","author":"Banner","year":"2018"},{"key":"ref8","first-page":"1826","article-title":"MOCCASIN: efficient tensor rematerialization for neural networks","volume-title":"Int Conf on Machine Learning","author":"Bartan","year":"2023"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1098\/rsta.2019.0049"},{"key":"ref10","first-page":"23844","article-title":"Efficient combination of rematerialization and offloading for training DNNs","volume-title":"Proc 35th Conf on Neural Information Processing Systems","author":"Beaumont","year":"2021"},{"key":"ref11","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc 34th Conf on Neural Information Processing Systems","author":"Brown","year":"2020"},{"key":"ref12","author":"Chen","year":"2024","journal-title":"DeepZero: scaling up zeroth-order optimization for deep model training"},{"key":"ref13","article-title":"A statistical frame-work for low-bitwidth training of deep neural networks","volume-title":"Proc 34th Int Conf on Neural Information Processing Systems","author":"Chen","year":"2020"},{"key":"ref14","first-page":"1803","article-title":"ActNN: reducing training memory footprint via 2-bit activation compressed training","volume-title":"Proc 38th Int Conf on Machine Learning","author":"Chen","year":"2021"},{"key":"ref15","author":"Chen","year":"2023","journal-title":"AutoDDL: automatic distributed deep learning with asymptotically optimal communication"},{"key":"ref16","author":"Chen","year":"2015","journal-title":"MXNet: a flexible and efficient machine learning library for heterogeneous distributed systems"},{"key":"ref17","author":"Chen","year":"2016","journal-title":"Training deep nets with sublinear memory cost"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/w14-4012"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/mm.2021.3061394"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1002\/aris.1440370103"},{"key":"ref21","first-page":"2260","article-title":"Momentum improves normal-ized SGD","volume-title":"Proc 37th Int Conf on Machine Learning","author":"Cutkosky","year":"2020"},{"key":"ref22","author":"Dao","year":"2023","journal-title":"FlashAttention-2: faster attention with better parallelism and work partitioning"},{"key":"ref23","first-page":"16344","article-title":"FlashAttention: fast and memory-efficient exact attention with IO-awareness","volume-title":"Proc 36th Conf on Neural Information Processing Systems","author":"Dao","year":"2022"},{"key":"ref24","first-page":"1223","article-title":"Large scale dis-tributed deep networks","volume-title":"Proc 25th Int Conf on Neural Information Processing Systems","author":"Dean","year":"2012"},{"key":"ref25","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional Transformers for language under-standing","volume-title":"Proc Conf of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Devlin","year":"2019"},{"key":"ref26","article-title":"Unified language model pre-training for natural language understanding and generation","volume-title":"Proc 33rd Int Conf on Neural Information Processing Systems","author":"Dong","year":"2019"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/tpds.2022.3219819"},{"issue":"1","key":"ref29","first-page":"120","article-title":"Switch Transformers: scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"J Mach Learn Res"},{"key":"ref30","article-title":"Don\u2019t waste your bits! Squeeze activations and gradients for deep neural networks via TINYSCRIPT","volume-title":"Proc 37th Int Conf on Machine Learning","author":"Fu","year":"2020"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/mm.2024.3373763"},{"key":"ref32","author":"Guan","year":"2019","journal-title":"XPipe: efficient pipeline model parallelism for multi-GPU DNN training"},{"key":"ref33","author":"Gusak","year":"2022","journal-title":"Survey on large scale neural network training"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/42411.42415"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2022.3152247"},{"key":"ref36","author":"Han","year":"2016","journal-title":"Deep compression: com-pressing deep neural networks with pruning, trained quantization and Huffman coding"},{"key":"ref37","first-page":"4150","article-title":"PipeTransformer: automated elastic pipelining for distributed training of large-scale models","volume-title":"Proc 38th Int Conf on Machine Learning","author":"He","year":"2021"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.90"},{"key":"ref39","author":"Herrmann","year":"2019","journal-title":"Optimal checkpointing for heterogeneous chains: how to train deep neural networks with limited memory"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378465"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/1090.001.0001"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378530"},{"key":"ref43","article-title":"GPipe: ef-ficient training of giant neural networks using pipeline parallelism","volume-title":"Proc 33rd Conf on Neural Information Pro-cessing Systems","author":"Huang","year":"2019"},{"key":"ref44","first-page":"497","article-title":"Checkmate: breaking the memory wall with optimal tensor rematerialization","volume-title":"Proc 3rd Conf on Machine Learning and Systems","author":"Jain","year":"2020"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2012.59"},{"key":"ref46","author":"Jia","year":"2018","journal-title":"Dissecting the NVIDIA Volta GPU architecture via microbenchmarking"},{"key":"ref47","first-page":"1","article-title":"Beyond data and model parallelism for deep neural networks","volume-title":"Proc 2nd Conf on Machine Learning and Systems","author":"Jia","year":"2019"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467304"},{"key":"ref49","author":"Kim","year":"2020","journal-title":"torchgpipe: on-the-fly pipeline parallelism for training giant models"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"},{"key":"ref51","article-title":"Dynamic tensor rematerialization","volume-title":"Proc 9th Int Conf on Learning Representations","author":"Kirisame","year":"2021"},{"key":"ref52","article-title":"Reformer: the efficient Transformer","volume-title":"Proc 8th Int Conf on Learning Representations","author":"Kitaev","year":"2020"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.3390\/electronics11010141"},{"key":"ref54","first-page":"5","article-title":"Reducing activation recomputation in large Transformer models","volume-title":"Proc 6th Conf on Machine Learning and Systems","author":"Korthikanti","year":"2023"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/301453.301487"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.5555\/2685048.2685095"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"ref61","author":"Li","year":"2022","journal-title":"Sequence parallelism: long sequence training from system perspective"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605613"},{"key":"ref63","author":"Liang","year":"2022","journal-title":"A survey on auto-parallelism of neural networks training"},{"key":"ref64","article-title":"Deep gradient compression: reducing the communication bandwidth for distributed training","volume-title":"Proc 6th Int Conf on Learning Representations","author":"Lin","year":"2018"},{"key":"ref65","author":"Lin","year":"2023","journal-title":"SuperScaler: supporting flexible DNN parallelization via a unified abstraction"},{"key":"ref66","first-page":"3731","article-title":"Zeroth-order stochastic variance reduction for nonconvex optimization","volume-title":"Proc 32nd Int Conf on Neural Information Processing Systems","author":"Liu","year":"2018"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00986"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607073"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.243"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508417"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-68279-0_17"},{"key":"ref72","author":"Micikevicius","year":"2018","journal-title":"Mixed precision training"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230560"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/icde53745.2022.00241"},{"key":"ref76","author":"Achiam","year":"2024","journal-title":"GPT-4 technical report"},{"key":"ref77","first-page":"307","article-title":"HetPipe: enabling large DNN training on (Whimpy) heterogeneous GPU clusters through integration of pipelined model parallelism and data parallelism","volume-title":"Proc USENIX Annual Technical Conf","author":"Park","year":"2020"},{"key":"ref78","article-title":"PyTorch: an imperative style, high-performance deep learning library","volume-title":"Proc 33rd Int Conf on Neural Information Processing Systems","author":"Paszke","year":"2019"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378505"},{"key":"ref80","article-title":"The Kaldi speech recognition toolkit","volume-title":"Proc IEEE Workshop on Automatic Speech Recognition and Understanding","author":"Povey","year":"2011"},{"key":"ref81","author":"Pudipeddi","year":"2020","journal-title":"Training large neural networks with constant memory using a new execution algorithm"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-020-1647-3"},{"issue":"1","key":"ref83","first-page":"140","article-title":"Exploring the limits of transfer learning with a unified text-to-text Transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J Mach Learn Res"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/sc41405.2020.00024"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"ref86","first-page":"18332","article-title":"DeepSpeed-MoE: advancing mixture-of-experts inference and training to power next-generation AI scale","volume-title":"Proc 39th Int Conf on Machine Learning","author":"Rajbhandari","year":"2022"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d16-1264"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref89","first-page":"551","article-title":"ZeRO-Offload: democratizing billion-scale model training","volume-title":"Proc USENIX Annual Technical Conf","author":"Ren","year":"2021"},{"key":"ref90","first-page":"91","article-title":"Faster R-CNN: towards real-time object detection with region proposal networks","volume-title":"Proc 28th Int Conf on Neural Information Processing Systems","author":"Ren","year":"2015"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/micro.2016.7783721"},{"key":"ref92","author":"Sergeev","year":"2018","journal-title":"Horovod: fast and easy distributed deep learning in TensorFlow"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507777"},{"key":"ref94","first-page":"4596","article-title":"Adafactor: adaptive learning rates with sublinear memory cost","volume-title":"Proc 35th Int Conf on Machine Learning","author":"Shazeer","year":"2018"},{"key":"ref95","author":"Shoeybi","year":"2020","journal-title":"Megatron-LM: training multi-billion parameter language models using model parallelism"},{"key":"ref96","article-title":"Hybrid 8-bit floating point (HFP8) training and inference for deep neural networks","volume-title":"Proc 33rd Int Conf on Neural Information Processing Systems","author":"Sun","year":"2019"},{"key":"ref97","article-title":"Ultra-low precision 4-bit training of deep neural networks","volume-title":"Proc 34th Int Conf on Neural Information Processing Systems","author":"Sun","year":"2020"},{"key":"ref98","author":"Sun","year":"2019","journal-title":"ERNIE: enhanced representation through knowledge integration"},{"key":"ref99","author":"Sun","year":"2021","journal-title":"ERNIE 3.0: large-scale knowledge enhanced pre-training for language understanding and generation"},{"key":"ref100","first-page":"1139","article-title":"On the importance of initialization and momentum in deep learning","volume-title":"Proc 30th Int Conf on Machine Learning","author":"Sutskever","year":"2013"},{"key":"ref101","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","volume-title":"Proc 27th Int Conf on Neural Information Processing Systems","author":"Sutskever","year":"2014"},{"key":"ref102","author":"Tang","year":"2022","journal-title":"DELTA: dynamically optimizing GPU memory beyond tensor recomputation"},{"key":"ref103","first-page":"267","article-title":"Unity: accelerating DNN training through joint optimization of algebraic transformations and parallelization","volume-title":"Proc 16th USENIX Symp on Operating Systems Design and Implementation","author":"Unger","year":"2022"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"ref106","first-page":"7686","article-title":"Training deep neural networks with 8-bit floating point numbers","volume-title":"Proc 32nd Int Conf on Neural Information Processing Systems","author":"Wang","year":"2018"},{"key":"ref107","article-title":"H3T: efficient integration of memory optimization and parallelism for high-throughput Transformer training","volume-title":"Proc 37th Conf on Neural Information Processing Systems","author":"Wang","year":"2023"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553516"},{"key":"ref109","article-title":"Training Transformers with 4-bit integers","volume-title":"Proc 37th Conf on Neural Information Processing Systems","author":"Xi","year":"2023"},{"key":"ref110","first-page":"10524","article-title":"On layer normalization in the Transformer architecture","volume-title":"Proc 37th Int Conf on Machine Learning","author":"Xiong","year":"2020"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1145\/2757667.2757684"},{"key":"ref112","author":"Yao","year":"2023","journal-title":"DeepSpeed-Chat: easy, fast and affordable RLHF training of ChatGPT-like models at all scales"},{"key":"ref113","article-title":"Large batch optimization for deep learning: training BERT in 76 minutes","volume-title":"Proc 8th Int Conf on Learning Representations","author":"You","year":"2020"},{"key":"ref114","author":"Yuan","year":"2022","journal-title":"OneFlow: redesign the distributed deep learning framework from scratch"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2013.6639215"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-1009"},{"key":"ref117","author":"Zeng","year":"2021","journal-title":"PanGu-a: large-scale autoregressive pretrained Chinese language models with auto-parallel computation"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_23"},{"key":"ref119","article-title":"Coop: memory is not a commodity","volume-title":"Proc 34th Conf on Neural Information Processing Systems","author":"Zhang","year":"2023"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.07.001"},{"key":"ref121","first-page":"42018","article-title":"Rock-mate: an efficient, fast, automatic and generic tool for re-materialization in PyTorch","volume-title":"Proc 40th Int Conf on Machine Learning","author":"Zhao","year":"2023"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"ref123","first-page":"559","article-title":"Alpa: automating inter- and intra-operator parallelism for distributed deep learning","volume-title":"Proc 16th USENIX Symp on Operating Systems Design and Implementation","author":"Zheng","year":"2022"},{"key":"ref124","author":"Zhong","year":"2023","journal-title":"MQSP: micro-query sequence parallelism for linearly scaling long sequence Transformer"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1631\/fitee.2300089"},{"key":"ref126","author":"Zhou","year":"2018","journal-title":"DoReFa-Net: training low bitwidth convolutional neural networks with low bitwidth gradients"},{"key":"ref127","author":"Zhuang","year":"2022","journal-title":"Understanding AdamW through proximal methods and scale-freeness"}],"container-title":["Frontiers of Information Technology &amp; Electronic Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1631\/FITEE.2300710.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1631\/FITEE.2300710\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1631\/FITEE.2300710.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T06:59:08Z","timestamp":1771657148000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1631\/FITEE.2300710"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3]]},"references-count":127,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,3]]}},"alternative-id":["710"],"URL":"https:\/\/doi.org\/10.1631\/fitee.2300710","relation":{},"ISSN":["2095-9184","2095-9230"],"issn-type":[{"value":"2095-9184","type":"print"},{"value":"2095-9230","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3]]},"assertion":[{"value":"17 October 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 March 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Dongsheng LI is a corresponding expert of\n                      Frontiers of Information Technology & Electronic Engineering\n                      , and he was not involved with the peer review process of this paper. All the authors declare that they have no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}