{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T17:18:21Z","timestamp":1775841501764,"version":"3.50.1"},"reference-count":50,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&#x0026;D Program of China","award":["2022ZD0116315"],"award-info":[{"award-number":["2022ZD0116315"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23B2048"],"award-info":[{"award-number":["U23B2048"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U22B2037"],"award-info":[{"award-number":["U22B2037"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"China National Postdoctoral Program for Innovative Talents","award":["BX20230012"],"award-info":[{"award-number":["BX20230012"]}]},{"name":"Beijing R&#x0026;D Program","award":["Z231100010323002"],"award-info":[{"award-number":["Z231100010323002"]}]},{"name":"Beijing NSF","award":["4244080"],"award-info":[{"award-number":["4244080"]}]},{"name":"ZTE-PKU joint program"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Knowl. Data Eng."],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1109\/tkde.2024.3370614","type":"journal-article","created":{"date-parts":[[2024,2,27]],"date-time":"2024-02-27T19:26:44Z","timestamp":1709062004000},"page":"3906-3920","source":"Crossref","is-referenced-by-count":12,"title":["Improving Automatic Parallel Training via Balanced Memory Workload Optimization"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-8375-493X","authenticated-orcid":false,"given":"Yujie","family":"Wang","sequence":"first","affiliation":[{"name":"School of CS and Key Lab of High Confidence Software Technologies (MOE), Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9297-642X","authenticated-orcid":false,"given":"Youhe","family":"Jiang","sequence":"additional","affiliation":[{"name":"School of CS and Key Lab of High Confidence Software Technologies (MOE), Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9371-8358","authenticated-orcid":false,"given":"Xupeng","family":"Miao","sequence":"additional","affiliation":[{"name":"Computer Science Department, Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1658-0380","authenticated-orcid":false,"given":"Fangcheng","family":"Fu","sequence":"additional","affiliation":[{"name":"School of CS and Key Lab of High Confidence Software Technologies (MOE), Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0267-775X","authenticated-orcid":false,"given":"Shenhan","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of CS and Key Lab of High Confidence Software Technologies (MOE), Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6766-757X","authenticated-orcid":false,"given":"Xiaonan","family":"Nie","sequence":"additional","affiliation":[{"name":"School of CS and Key Lab of High Confidence Software Technologies (MOE), Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2616-2273","authenticated-orcid":false,"given":"Yaofeng","family":"Tu","sequence":"additional","affiliation":[{"name":"ZTE Company, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1681-4677","authenticated-orcid":false,"given":"Bin","family":"Cui","sequence":"additional","affiliation":[{"name":"School of CS and Key Lab of High Confidence Software Technologies (MOE), Peking University, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-022-2140-7"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/s41019-022-00187-3"},{"issue":"S1","key":"ref5","first-page":"27","article-title":"End-to-end chinese entity recognition based on BERT-BiLSTM-att-CRF","volume":"20","author":"Li","year":"2022","journal-title":"ZTE Commun."},{"issue":"2","key":"ref6","first-page":"11","article-title":"Deep learning-based semantic feature extraction: A literature review and future directions","volume":"21","author":"Deng","year":"2023","journal-title":"ZTE Commun."},{"key":"ref7","article-title":"Language models are few-shot learners","volume-title":"Proc. 34th Int. Conf. Neural Inf. Process. Syst.","author":"Brown"},{"key":"ref8","article-title":"Do transformers really perform bad for graph representation?","author":"Ying","year":"2021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/s41019-023-00207-w"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357895"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"issue":"8","key":"ref12","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref13","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref15","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref16","first-page":"8821","article-title":"Zero-shot text-to-image generation","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Ramesh"},{"key":"ref17","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"ref18","first-page":"2595","article-title":"Parallelized stochastic gradient descent","volume-title":"Proc. 23rd Int. Conf. Neural Inf. Process. Syst.","author":"Zinkevich"},{"key":"ref19","first-page":"1223","article-title":"Large scale distributed deep networks","volume-title":"Proc. 25th Int. Conf. Neural Inf. Process. Syst.","author":"Dean"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3452773"},{"key":"ref21","doi-asserted-by":"crossref","DOI":"10.1145\/3458817.3476209","article-title":"Efficient large-scale language model training on GPU clusters using megatron-LM","volume-title":"Proc. Int. Conf. High Perform. Comput. Netw. Storage Anal.","author":"Narayanan"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref23","first-page":"7937","article-title":"Memory-efficient pipeline-parallel DNN training","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Narayanan"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref25","first-page":"269","article-title":"Pipemare: Asynchronous pipeline parallel DNN training","volume-title":"Proc. Mach. Learn. Syst.","author":"Yang"},{"key":"ref26","first-page":"2279","article-title":"Exploring hidden dimensions in parallelizing convolutional neural networks","volume-title":"Proc. 35th Int. Conf. Mach. Learn.","author":"Jia"},{"key":"ref27","first-page":"1","article-title":"Beyond data and model parallelism for deep neural networks","volume-title":"Proc. Mach. Learn. Syst.","author":"Jia"},{"key":"ref28","first-page":"267","article-title":"Unity: Accelerating dnn training through joint optimization of algebraic transformations and parallelization","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation","author":"Unger"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3132413"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref34","article-title":"RoBERTa: A robustly optimized BERT pretraining approach","author":"Liu","year":"2019"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref37","article-title":"FairScale: A general purpose modular PyTorch library for high performance and large scale training","author":"Baines","year":"2021"},{"key":"ref38","first-page":"7937","article-title":"Memory-efficient pipeline-parallel DNN training","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Narayanan"},{"key":"ref39","article-title":"Using DeepSpeed and megatron to train megatron-turing nlg 530b, a large-scale generative language model","author":"Smith","year":"2022"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"key":"ref43","article-title":"Hetu: A highly efficient automatic parallel distributed deep learning system","volume":"66","author":"Miao","year":"2022","journal-title":"Sci. China Inf. Sci."},{"issue":"2","key":"ref44","first-page":"312","article-title":"HET: Scaling out huge embedding model training via cache-enabled distributed framework","volume-title":"Proc. VLDB Endowment","volume":"15","author":"Miao"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3517902"},{"key":"ref46","article-title":"HetuMoE: An efficient trillion-scale mixture-of-expert distributed training system","author":"Nie","year":"2022"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.14778\/3570690.3570697"},{"key":"ref48","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. 33rd Int. Conf. Neural Inf. Process. Syst.","author":"Paszke"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"ref50","article-title":"GSPMD: General and scalable parallelization for ML computation graphs","author":"Xu","year":"2021"},{"key":"ref51","first-page":"559","article-title":"Alpa: Automating inter- and intra-operator parallelism for distributed deep learning","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation","author":"Zheng"},{"key":"ref53","article-title":"Automatic cross-replica sharding of weight update in data-parallel training","author":"Xu","year":"2020"}],"container-title":["IEEE Transactions on Knowledge and Data Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/69\/10589999\/10449463.pdf?arnumber=10449463","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,18]],"date-time":"2024-07-18T05:18:21Z","timestamp":1721279901000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10449463\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8]]},"references-count":50,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tkde.2024.3370614","relation":{},"ISSN":["1041-4347","1558-2191","2326-3865"],"issn-type":[{"value":"1041-4347","type":"print"},{"value":"1558-2191","type":"electronic"},{"value":"2326-3865","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8]]}}}