{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,16]],"date-time":"2026-03-16T22:40:23Z","timestamp":1773700823327,"version":"3.50.1"},"reference-count":102,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2023,8,1]],"date-time":"2023-08-01T00:00:00Z","timestamp":1690848000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,8,1]],"date-time":"2023-08-01T00:00:00Z","timestamp":1690848000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,8,1]],"date-time":"2023-08-01T00:00:00Z","timestamp":1690848000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62025208"],"award-info":[{"award-number":["62025208"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2023,8]]},"DOI":"10.1109\/tpds.2023.3281931","type":"journal-article","created":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T18:50:48Z","timestamp":1687805448000},"page":"2377-2390","source":"Crossref","is-referenced-by-count":29,"title":["A Survey on Auto-Parallelism of Large-Scale Deep Learning Training"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5590-5179","authenticated-orcid":false,"given":"Peng","family":"Liang","sequence":"first","affiliation":[{"name":"State Key Laboratory of Parallel and Distributed Processing, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8595-1547","authenticated-orcid":false,"given":"Yu","family":"Tang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Parallel and Distributed Processing, National University of Defense Technology, Changsha, China"}]},{"given":"Xiaoda","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co. Ltd., Shenzhen, China"}]},{"given":"Youhui","family":"Bai","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co. Ltd., Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9517-2845","authenticated-orcid":false,"given":"Teng","family":"Su","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co. Ltd., Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3458-4732","authenticated-orcid":false,"given":"Zhiquan","family":"Lai","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Parallel and Distributed Processing, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8285-2738","authenticated-orcid":false,"given":"Linbo","family":"Qiao","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Parallel and Distributed Processing, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9743-2034","authenticated-orcid":false,"given":"Dongsheng","family":"Li","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Parallel and Distributed Processing, National University of Defense Technology, Changsha, China"}]}],"member":"263","reference":[{"key":"ref57","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref56","first-page":"6543","article-title":"TeraPipe: Token-level pipeline parallelism for training large-scale language models","author":"li","year":"2021","journal-title":"Proc 38th Int Conf Mach Learn"},{"key":"ref59","first-page":"2274","article-title":"Exploring hidden dimensions in accelerating convolutional neural networks","author":"jia","year":"2018","journal-title":"Proc 35th Int Conf Mach Learn"},{"key":"ref58","article-title":"Sequence parallelism: Making 4D parallelism possible","author":"li","year":"2021"},{"key":"ref53","article-title":"End-to-end adaptive distributed training on PaddlePaddle","author":"ao","year":"2021"},{"key":"ref52","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref55","article-title":"GShard: Scaling giant models with conditional computation and automatic sharding","author":"lepikhin","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref54","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"Proc 12th USENIX Conf Operating Syst Des Implementation"},{"key":"ref51","article-title":"OneFlow: Redesign the distributed deep learning framework from scratch","author":"yuan","year":"2021"},{"key":"ref50","year":"2019"},{"key":"ref46","article-title":"Maximizing parallelism in distributed training for huge neural networks","author":"bian","year":"2021"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref48","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref47","article-title":"Colossal-AI: A unified deep learning system for large-scale parallel training","author":"bian","year":"2021"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3047974"},{"key":"ref43","first-page":"1","article-title":"GEMS: GPU-enabled memory-aware model-parallelism system for distributed DNN training","author":"jain","year":"2020","journal-title":"Proc Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/347837.347846"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3363554"},{"key":"ref7","first-page":"559","article-title":"Alpa: Automating inter- and intra-operator parallelism for distributed deep learning","author":"zheng","year":"2022","journal-title":"Proc 16th USENIX Symp Oper Syst Des Implementation"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3377454"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3003307"},{"key":"ref6","first-page":"7937","article-title":"Memory-efficient pipeline-parallel DNN training","author":"narayanan","year":"2021","journal-title":"Proc 38th Int Conf Mach Learn"},{"key":"ref5","article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","author":"shoeybi","year":"2019"},{"key":"ref100","first-page":"463","article-title":"A unified architecture for accelerating distributed DNN training in heterogeneous GPU\/CPU clusters","author":"jiang","year":"2020","journal-title":"Proc 14th USENIX Symp Oper Syst Des Implementation"},{"key":"ref101","first-page":"93","article-title":"Overlap communication with dependent computation via decomposition in large deep learning models","author":"wang","year":"2023","journal-title":"Proc 6th Int Conf Archit Support Program Lang Oper Syst"},{"key":"ref40","article-title":"PanGu-$\\alpha$?: Large-scale autoregressive pretrained Chinese language models with auto-parallel computation","author":"zeng","year":"2021"},{"key":"ref35","first-page":"15451","article-title":"Efficient algorithms for device placement of DNN graph operators","author":"tarnawski","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref37","article-title":"Reducing activation recomputation in large transformer models","author":"korthikanti","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3132413"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00111"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00109"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458829"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"ref38","first-page":"103","article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","author":"huang","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref24","article-title":"NP-completeness of dynamic remapping","author":"kremer","year":"1993","journal-title":"Proc 4th Workshop Compilers Parallel Comput"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/FMPC.1990.89493"},{"key":"ref26","author":"mohri","year":"2018","journal-title":"Foundations of Machine Learning"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/0743-7315(91)90090-V"},{"key":"ref20","first-page":"548","article-title":"Synthesizing optimal parallelism placement and reduction strategies on hierarchical systems for deep learning","author":"xie","year":"2022","journal-title":"Proc Mach Learn Syst"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/291891.291901"},{"key":"ref21","article-title":"Memory-efficient array redistribution through portable collective communication","author":"rink","year":"2021"},{"key":"ref28","first-page":"24829","article-title":"Piper: Multidimensional planner for DNN parallelization","author":"tarnawski","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1126\/science.153.3731.34"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3151739"},{"key":"ref13","article-title":"Training deep nets with sublinear memory cost","author":"chen","year":"2016"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-85665-6_13"},{"key":"ref15","article-title":"HeterPS: Distributed deep learning with reinforcement learning based scheduling in heterogeneous environments","author":"liu","year":"2021"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2018.2877359"},{"key":"ref97","first-page":"10435","article-title":"Mesh-TensorFlow: Deep learning for supercomputers","author":"shazeer","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref96","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00027"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3247001"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00036"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2019.2947013"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2019.00057"},{"key":"ref93","article-title":"GSPMD: General and scalable parallelization for ML computation graphs","author":"xu","year":"2021"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3094364"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.5244\/C.30.87"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1145\/3464298.3476132"},{"key":"ref90","first-page":"4502","article-title":"Interaction networks for learning about objects, relations and physics","author":"battaglia","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"ref86","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"raffel","year":"2020","journal-title":"J Mach Learn Res"},{"key":"ref85","article-title":"XLA: Compiling machine learning for peak performance","author":"sabne","year":"2020"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00065"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/s10732-010-9143-1"},{"key":"ref81","first-page":"1025","article-title":"Inductive representation learning on large graphs","author":"hamilton","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3030548"},{"key":"ref83","first-page":"1899","article-title":"Graying the black box: Understanding DQNs","author":"zahavy","year":"2016"},{"key":"ref80","author":"miller","year":"1985","journal-title":"Markov Decision Process"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-80"},{"key":"ref78","first-page":"1310","article-title":"On the difficulty of training recurrent neural networks","author":"pascanu","year":"2013","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/TCIAIG.2012.2186810"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1201\/b14835"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3062721"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1613\/jair.301"},{"key":"ref2","article-title":"Language models are few-shot learners","author":"brown","year":"2020"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/BF01386232"},{"key":"ref70","article-title":"High-performance, distributed training of large-scale deep learning recommendation models","author":"mudigere","year":"2021"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/1837274.1837289"},{"key":"ref72","author":"schrijver","year":"1998","journal-title":"Theory of Linear and Integer Programming"},{"key":"ref68","first-page":"1676","article-title":"Spotlight: Optimizing device placement for training deep neural networks","author":"gao","year":"2018","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref67","article-title":"Placeto: Learning generalizable device placement algorithms for distributed machine learning","author":"addanki","year":"2019"},{"key":"ref69","article-title":"Reinforced genetic algorithm learning for optimizing computation graphs","author":"paliwal","year":"2020","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref64","article-title":"GDP: Generalized device placement for dataflow graphs","author":"zhou","year":"2019"},{"key":"ref63","first-page":"2430","article-title":"Device placement optimization with reinforcement learning","author":"mirhoseini","year":"2017","journal-title":"Proc 34th Int Conf Mach Learn"},{"key":"ref66","article-title":"A hierarchical model for device placement","author":"mirhoseini","year":"2018","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref65","first-page":"187","article-title":"Improving the accuracy, scalability, and performance of graph neural networks with ROC","author":"jia","year":"2020","journal-title":"Proc Mach Learn Syst"},{"key":"ref60","article-title":"Automap: Towards ergonomic automated parallelism for ML models","author":"schaarschmidt","year":"2021"},{"key":"ref62","article-title":"Auto-MAP: A DQN framework for exploring distributed execution plans for DNN workloads","author":"wang","year":"2020"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/10153956\/10163912.pdf?arnumber=10163912","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,24]],"date-time":"2023-07-24T17:57:19Z","timestamp":1690221439000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10163912\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8]]},"references-count":102,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2023.3281931","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,8]]}}}