{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T12:26:46Z","timestamp":1780576006529,"version":"3.54.1"},"reference-count":36,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2023,1,1]]},"DOI":"10.1109\/tpds.2022.3219819","type":"journal-article","created":{"date-parts":[[2022,11,7]],"date-time":"2022-11-07T22:29:01Z","timestamp":1667860141000},"page":"304-315","source":"Crossref","is-referenced-by-count":33,"title":["Parallel Training of Pre-Trained Models via Chunk-Based Dynamic Memory Management"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6724-2763","authenticated-orcid":false,"given":"Jiarui","family":"Fang","sequence":"first","affiliation":[{"name":"Tencent Inc., Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zilin","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tencent Inc., Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shenggui","family":"Li","sequence":"additional","affiliation":[{"name":"National Singapore University, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hui","family":"Su","sequence":"additional","affiliation":[{"name":"Tencent Inc., Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yang","family":"Yu","sequence":"additional","affiliation":[{"name":"Tencent Inc., Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jie","family":"Zhou","sequence":"additional","affiliation":[{"name":"Tencent Inc., Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yang","family":"You","sequence":"additional","affiliation":[{"name":"National Singapore University, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"crossref","first-page":"72","DOI":"10.1016\/j.ipl.2006.11.009","article-title":"A short proof of optimality for the min cache replacement algorithm","volume":"102","author":"roy","year":"2007","journal-title":"Inf Process Lett"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1147\/sj.92.0078"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1147\/sj.52.0078"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"ref36","article-title":"Colossal-AI: A unified deep learning system for large-scale parallel training","author":"li","year":"2021"},{"key":"ref35","article-title":"NVIDIA DGX A100 the universal system for AI infrastructure","year":"2020"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1147\/rd.181.0002"},{"key":"ref10","article-title":"Mixed precision training","author":"micikevicius","year":"2017"},{"key":"ref11","first-page":"1232","article-title":"Large scale distributed deep networks","author":"dean","year":"2012","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"ref13","article-title":"Mesh-TensorFlow: Deep learning for supercomputers","author":"shazeer","year":"2018"},{"key":"ref14","first-page":"103","article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","author":"huang","year":"2019","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref16","first-page":"6543","article-title":"TeraPipe: Token-level pipeline parallelism for training large-scale language models","author":"li","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref17","article-title":"Training deep nets with sublinear memory cost","author":"chen","year":"2016"},{"key":"ref18","first-page":"4125","article-title":"Memory-efficient backpropagation through time","author":"gruslys","year":"2016","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref19","article-title":"Optimal checkpointing for heterogeneous chains: How to train deep neural networks with limited memory","author":"herrmann","year":"2019"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3368089.3417050"},{"key":"ref4","article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","author":"shoeybi","year":"2019"},{"key":"ref27","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"radford","year":"2019","journal-title":"OpenAIRE blog"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"ref29","article-title":"On-device neural net inference with mobile GPUs","author":"lee","year":"2019"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref8","first-page":"551","article-title":"ZeRO-Offload: Democratizing billion-scale model training","author":"ren","year":"2021","journal-title":"Proc USENIX Annu Tech Conf"},{"key":"ref7","article-title":"Training large neural networks with constant memory using a new execution algorithm","author":"pudipeddi","year":"2020"},{"key":"ref2","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018"},{"key":"ref9","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2020"},{"key":"ref20","first-page":"8026","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref22","first-page":"200","article-title":"Dynamic memory management for GPU-based training of deep neural networks","author":"shriram","year":"2019","journal-title":"Proc IEEE Int Parallel Distrib Process Symp"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"ref24","first-page":"23844","article-title":"Efficient combination of rematerialization and offloading for training DNNs","author":"beaumont","year":"2021","journal-title":"Proc 35th Conf Neural Inf Process Syst"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-57675-2_10"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"ref25","article-title":"DeepSpeed open-source code","year":"2021"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/9956797\/09940581.pdf?arnumber=9940581","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,19]],"date-time":"2022-12-19T19:47:03Z","timestamp":1671479223000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9940581\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,1]]},"references-count":36,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2022.3219819","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,1,1]]}}}