{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T04:13:51Z","timestamp":1773116031908,"version":"3.50.1"},"reference-count":62,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"Huawei Innovation Research Program","award":["HK RGC ECS 27200916"],"award-info":[{"award-number":["HK RGC ECS 27200916"]}]},{"name":"Huawei Innovation Research Program","award":["HK RGC GRF 17207117"],"award-info":[{"award-number":["HK RGC GRF 17207117"]}]},{"name":"Huawei Innovation Research Program","award":["17202318"],"award-info":[{"award-number":["17202318"]}]},{"name":"Huawei Innovation Research Program","award":["27208720"],"award-info":[{"award-number":["27208720"]}]},{"name":"Croucher Innovation Award"},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61802358"],"award-info":[{"award-number":["61802358"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"USTC Research Funds of Double First-Class Initiative","award":["YD2150002006"],"award-info":[{"award-number":["YD2150002006"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2022,3,1]]},"DOI":"10.1109\/tpds.2021.3094364","type":"journal-article","created":{"date-parts":[[2021,7,2]],"date-time":"2021-07-02T19:29:50Z","timestamp":1625254190000},"page":"489-506","source":"Crossref","is-referenced-by-count":42,"title":["vPipe: A Virtualized Acceleration System for Achieving Efficient and Scalable Pipeline Parallel DNN Training"],"prefix":"10.1109","volume":"33","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1643-2583","authenticated-orcid":false,"given":"Shixiong","family":"Zhao","sequence":"first","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Fanxin","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2807-9780","authenticated-orcid":false,"given":"Xusheng","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Xiuxian","family":"Guan","sequence":"additional","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8684-8509","authenticated-orcid":false,"given":"Jianyu","family":"Jiang","sequence":"additional","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Dong","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Yuhao","family":"Qing","sequence":"additional","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Sen","family":"Wang","sequence":"additional","affiliation":[{"name":"2012 Labs, Theory Lab, Huawei Technoloies, Co. Ltd, Shenzhen, China"}]},{"given":"Peng","family":"Wang","sequence":"additional","affiliation":[{"name":"2012 Labs, Theory Lab, Huawei Technoloies, Co. Ltd, Shenzhen, China"}]},{"given":"Gong","family":"Zhang","sequence":"additional","affiliation":[{"name":"2012 Labs, Theory Lab, Huawei Technoloies, Co. Ltd, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7064-6120","authenticated-orcid":false,"given":"Cheng","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}]},{"given":"Ping","family":"Luo","sequence":"additional","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7746-440X","authenticated-orcid":false,"given":"Heming","family":"Cui","sequence":"additional","affiliation":[{"name":"Department of Computer Computer Science, The University of Hong Kong, Hong Kong, China"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33014780"},{"key":"ref38","first-page":"1","article-title":"ZeRO: Memory optimizations toward training trillion parameter models","author":"rajbhandari","year":"2020","journal-title":"Proc Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1287\/opre.43.3.477"},{"key":"ref31","article-title":"Regularizing and optimizing LSTM language models","author":"merity","year":"2017"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378505"},{"key":"ref36","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref35","first-page":"25","article-title":"Hermes: Dynamic partitioning for distributed social network graph databases","author":"nicoara","year":"2015","journal-title":"Proc Intl Conf Extending Database Technology"},{"key":"ref34","article-title":"DyNet: The dynamic neural network toolkit","author":"neubig","year":"2017"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/BigComp.2018.00136"},{"key":"ref62","article-title":"Few-shot neural architecture search","author":"zhao","year":"2020"},{"key":"ref61","year":"0"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICSI.1990.138741"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2016.12.038"},{"key":"ref2","first-page":"97","article-title":"An integrated genetic algorithm with dynamic hill climbing for VLSI circuit partitioning","author":"areibi","year":"2000","journal-title":"Proc Genet Evol Comput Conf"},{"key":"ref1","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"Proc 12th USENIX Symp Operating Syst Des Implementation"},{"key":"ref20","first-page":"6869","article-title":"Quantized neural networks: Training neural networks with low precision weights and activations","volume":"18","author":"hubara","year":"2017","journal-title":"J Mach Learn Res"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2189750.2151001"},{"key":"ref21","article-title":"Beyond data and model parallelism for deep neural networks","author":"jia","year":"2018"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/224170.224229"},{"key":"ref23","first-page":"113","article-title":"Multilevel graph partitioning schemes","author":"karypis","year":"0"},{"key":"ref26","article-title":"One weird trick for parallelizing convolutional neural networks","author":"krizhevsky","year":"2014"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1002\/j.1538-7305.1970.tb01770.x"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1016\/j.orl.2005.10.003"},{"key":"ref51","year":"0"},{"key":"ref59","article-title":"PipeMare: Asynchronous pipeline parallel DNN training","author":"yang","year":"2019"},{"key":"ref58","article-title":"Google&#x2019;s neural machine translation system: Bridging the gap between human and machine translation","author":"wu","year":"2016"},{"key":"ref57","article-title":"AlphaX: Exploring neural architectures with deep neural networks and Monte Carlo tree search","author":"wang","year":"2019"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"ref55","article-title":"Sample-efficient neural architecture search by learning action space","author":"wang","year":"2019"},{"key":"ref54","doi-asserted-by":"crossref","DOI":"10.1109\/SC41405.2020.00023","article-title":"Scaling distributed deep learning workloads beyond the memory capacity with KARMA","author":"wahib","year":"2020"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref52","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref10","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD.1993.580083"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/BF01201263"},{"key":"ref13","volume":"3","author":"fj\u00e4llstr\u00f6m","year":"1998","journal-title":"Algorithms for graph partitioning A survey"},{"key":"ref14","article-title":"XPipe: Efficient pipeline model parallelism for multi-GPU DNN training","author":"guan","year":"2019"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1137\/S0895479892238270"},{"key":"ref17","first-page":"28-es","article-title":"A multi-level algorithm for partitioning graphs","author":"hendrickson","year":"0","journal-title":"Proc ACM\/IEEE Conf Supercomputing"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378530"},{"key":"ref19","first-page":"103","article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","author":"huang","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref4","year":"0"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CCECE.1998.685556"},{"key":"ref6","article-title":"Language models are few-shot learners","author":"brown","year":"2020"},{"key":"ref5","first-page":"499","article-title":"PipeSwitch: Fast pipelined context switching for deep learning applications","author":"bai","year":"2020","journal-title":"Proc 14th USENIX Symp Operating Syst Des Implementation"},{"key":"ref8","article-title":"MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems","author":"chen","year":"2015"},{"key":"ref7","article-title":"A heuristic for reducing fill-in in sparse matrix factorization","author":"bui","year":"1993","journal-title":"Tech Rep"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/115992.116012"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1355"},{"key":"ref45","article-title":"The evolved transformer","author":"so","year":"2019"},{"key":"ref48","first-page":"538","article-title":"Unified geometric approach to graph separators","author":"teng","year":"0"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TEVC.2017.2778089"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-2323"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/2670338"},{"key":"ref44","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014"},{"key":"ref43","article-title":"Horovod: Fast and easy distributed deep learning in tensorflow","author":"sergeev","year":"2018"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/9497774\/09472938.pdf?arnumber=9472938","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T19:20:48Z","timestamp":1733340048000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9472938\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,1]]},"references-count":62,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2021.3094364","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,3,1]]}}}