{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T00:20:23Z","timestamp":1762042823483,"version":"build-2065373602"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"name":"The National Key Research and Development Program of China","award":["2018YFB0203803"],"award-info":[{"award-number":["2018YFB0203803"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1711263"],"award-info":[{"award-number":["U1711263"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1811461"],"award-info":[{"award-number":["U1811461"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1801266"],"award-info":[{"award-number":["U1801266"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003453","name":"Natural Science Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2018B030312002"],"award-info":[{"award-number":["2018B030312002"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003453","name":"Natural Science Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2018A030313492"],"award-info":[{"award-number":["2018A030313492"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2022,3]]},"DOI":"10.1007\/s11227-021-04083-x","type":"journal-article","created":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T10:17:17Z","timestamp":1633083437000},"page":"5565-5587","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Iteration number-based hierarchical gradient aggregation for distributed deep learning"],"prefix":"10.1007","volume":"78","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6798-9683","authenticated-orcid":false,"given":"Danyang","family":"Xiao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinxin","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jieying","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunfei","family":"Du","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weigang","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,10,1]]},"reference":[{"key":"4083_CR1","unstructured":"Vipul G, Dhruv C, Ping Tak\u00a0Peter T, Xiaohan W, Xing W, Yuzhen H, Arun K, Kannan R, and Michael\u00a0WM (2021) Training recommender systems at scale: Communication-efficient model and data parallelism. In KDD \u201921: The 27th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Virtual Event, Singapore, August 14-18, 2021, pages 2928\u20132936. ACM,"},{"key":"4083_CR2","unstructured":"Wayne X, Lingfeng W, Fil A, Jasha D, Xuedong H, and Andreas S (2018) The microsoft 2017 conversational speech recognition system. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pages 5934\u20135938. IEEE"},{"issue":"12","key":"4083_CR3","doi-asserted-by":"publisher","first-page":"2802","DOI":"10.1109\/TPDS.2020.3003307","volume":"31","author":"M Langer","year":"2020","unstructured":"Langer M, He Z, Rahayu W, Xue Y (2020) Distributed training of deep learning models: a taxonomic perspective. IEEE Trans Parallel Distrib Syst 31(12):2802\u20132818","journal-title":"IEEE Trans Parallel Distrib Syst"},{"issue":"11","key":"4083_CR4","doi-asserted-by":"publisher","first-page":"2449","DOI":"10.1109\/TPDS.2019.2913833","volume":"30","author":"Y You","year":"2019","unstructured":"You Y, Zhang Z, Hsieh C-J, Demmel J, Keutzer K (2019) Fast deep neural network training on distributed systems and cloud tpus. IEEE Trans Parallel Distrib Syst 30(11):2449\u20132462","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"4083_CR5","unstructured":"J-H Park, G Yun, C-M Yi, N-T Nguyen, S Lee, J Choi, S-H Noh, and Y-R Choi (2020) Hetpipe: Enabling large DNN training on (whimpy) heterogeneous GPU clusters through integration of pipelined model parallelism and data parallelism. In 2020 USENIX Annual Technical Conference, USENIX ATC 2020, July 15-17, 2020, pages 307\u2013321. USENIX Association"},{"key":"4083_CR6","doi-asserted-by":"crossref","unstructured":"Alexander T (2011) BSP (bulk synchronous parallelism). In: Padua David A (ed) Encyclopedia of Parallel Computing. Springer, New York, pp 192\u2013199","DOI":"10.1007\/978-0-387-09766-4_311"},{"issue":"9","key":"4083_CR7","doi-asserted-by":"publisher","first-page":"9971","DOI":"10.1007\/s11227-021-03673-z","volume":"77","author":"S Barrachina","year":"2021","unstructured":"Barrachina S, Castell\u00f3 A, Catal\u00e1n M, Dolz Manuel F, Mestre Jos\u00e9 I (2021) Pydtnn: a user-friendly and extensible framework for distributed deep learning. J Supercomput 77(9):9971\u20139987","journal-title":"J Supercomput"},{"issue":"1","key":"4083_CR8","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1007\/s11227-019-02845-2","volume":"76","author":"E Yang","year":"2020","unstructured":"Yang E, Kang D-K, Youn C-H (2020) BOA: batch orchestration algorithm for straggler mitigation of distributed DL training in heterogeneous GPU cluster. J Supercomput 76(1):47\u201367","journal-title":"J Supercomput"},{"key":"4083_CR9","doi-asserted-by":"crossref","unstructured":"Henriksen T, Thor\u00f8e F, Elsman M, Oancea Cosmin E (2019) Incremental flattening for nested data parallelism. pp 53\u201367","DOI":"10.1145\/3300173"},{"key":"4083_CR10","doi-asserted-by":"crossref","unstructured":"Wang F, Zhang W, Guo H, Hao M, Gangzhao L, Wang Z (2021) Automatic translation of data parallel programs for heterogeneous parallelism through openmp offloading. J Supercomput 77(5):4957\u20134987","DOI":"10.1007\/s11227-020-03452-2"},{"key":"4083_CR11","unstructured":"Yanping H, Youlong C, Ankur B, Orhan F, Dehao C, Mia C, HyoukJoong L, Jiquan N, Quoc\u00a0V L, Yonghui W, and Zhifeng C (2019) Gpipe: Efficient training of giant neural networks using pipeline parallelism. In H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d\u2019 Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett, editors, Advances in Neural Information Processing Systems 32, pages 103\u2013112. 2019"},{"issue":"7","key":"4083_CR12","first-page":"1665","volume":"32","author":"D Jiangsu","year":"2021","unstructured":"Jiangsu D, Zhu X, Shen M, Yunfei D, Yutong L, Xiao N, Liao X (2021) Model parallelism optimization for distributed inference via decoupled CNN structure. IEEE Trans Parallel Distrib Syst 32(7):1665\u20131676","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"4083_CR13","unstructured":"Alexander S and Mike D\u00a0B (2018) Horovod: fast and easy distributed deep learning in tensorflow. arXiv preprint arXiv:1802.05799"},{"key":"4083_CR14","unstructured":"Mu L, David GA, Jun WP, Alexander JS, Amr A, Vanja J, James L, Eugene JS, and Bor-Yiing S (2014) Scaling distributed machine learning with the parameter server. In 11th $$\\{$$USENIX$$\\}$$ Symposium on Operating Systems Design and Implementation ($$\\{$$OSDI$$\\}$$ 14), pages 583\u2013598"},{"issue":"11","key":"4083_CR15","doi-asserted-by":"publisher","first-page":"1877","DOI":"10.14778\/3407790.3407796","volume":"13","author":"A Renz-Wieland","year":"2020","unstructured":"Renz-Wieland A, Gemulla R, Zeuch S, Markl V (2020) Dynamic parameter allocation in parameter servers. Proc VLDB Endow 13(11):1877\u20131890","journal-title":"Proc VLDB Endow"},{"key":"4083_CR16","unstructured":"Jeffrey D, Greg C, Rajat M, Kai C, Matthieu D, Mark M, Marc\u2019aurelio R, Andrew S, Paul T, Ke Y, et\u00a0al (2012) Large scale distributed deep networks. In Advances in neural information processing systems, pages 1223\u20131231"},{"key":"4083_CR17","doi-asserted-by":"publisher","first-page":"118","DOI":"10.1016\/j.ins.2020.05.121","volume":"548","author":"D Xiao","year":"2021","unstructured":"Xiao D, Mei Y, Kuang D, Chen M, Guo B, Weigang W (2021) Egc: entropy-based gradient compression for distributed deep learning. Inf Sci 548:118\u2013134","journal-title":"Inf Sci"},{"key":"4083_CR18","unstructured":"Zijie Y, Danyang X, Mengqiang C, Jieying Z, and Weigang W (2020) Dual-way gradient sparsification for asynchronous distributed deep learning. In 49th International Conference on Parallel Processing - ICPP, ICPP \u201920. Association for Computing Machinery"},{"key":"4083_CR19","doi-asserted-by":"crossref","unstructured":"Cui H, Zhang H, Ganger GR, Gibbons Phillip B, Xing Eric P (2016). Geeps: scalable deep learning on distributed gpus with a gpu-specialized parameter server. London, United Kingdom","DOI":"10.1145\/2901318.2901323"},{"key":"4083_CR20","unstructured":"Zhang R, Kwok J (2014) Asynchronous distributed admm for consensus optimization. pp 1701\u20131709"},{"key":"4083_CR21","unstructured":"Wei Z, Suyog G, Xiangru L, and Ji\u00a0L. Staleness-aware async-sgd for distributed deep learning. arXiv preprint arXiv:1511.05950, 2015"},{"key":"4083_CR22","unstructured":"Qirong H, James C, Henggang C, Seunghak L, Jin KK, Phillip BG, Garth AG, Greg G, and Eric PX (2013) More effective distributed ml via a stale synchronous parallel parameter server. In Advances in neural information processing systems, pages 1223\u20131231"},{"key":"4083_CR23","unstructured":"(2014) Caffe (2014) Convolutional architecture for fast feature embedding. In: J Yangqing, S Evan, D Jeff, K Sergey, L Jonathan, G Ross, G Sergio, D Trevor (eds). pp 675\u2013678"},{"issue":"12","key":"4083_CR24","doi-asserted-by":"publisher","first-page":"3740","DOI":"10.1109\/TAC.2016.2525015","volume":"61","author":"R Feyzmahdavian Hamid","year":"2016","unstructured":"Feyzmahdavian Hamid R, Aytekin A, Johansson M (2016) An asynchronous mini-batch algorithm for regularized stochastic optimization. IEEE Trans Autom Control 61(12):3740\u20133754","journal-title":"IEEE Trans Autom Control"},{"key":"4083_CR25","unstructured":"LeCun Y et al (2015) Lenet-5, convolutional neural networks. 20:5http:\/\/yann.lecun.com\/exdb\/lenet"},{"key":"4083_CR26","unstructured":"LeCun Y, Cortes C, Burges CJ (2010) Mnist handwritten digit database. AT&T Labs [Online]. Available 2:18http:\/\/yann.lecun.com\/exdb\/mnist"},{"key":"4083_CR27","unstructured":"Alex K, Vinod N, and Geoffrey H (2009) Cifar-10 and cifar-100 datasets. https:\/\/www.cs.toronto.edu\/kriz\/cifar.html, 6"},{"key":"4083_CR28","doi-asserted-by":"crossref","unstructured":"Chen M, Yan Z, Ren J, Wu W (2020) Standard deviation based adaptive gradient compression for distributed deep learning. pp 529\u2013538","DOI":"10.1109\/CCGrid49817.2020.00-40"},{"key":"4083_CR29","first-page":"4452","volume":"31","author":"U Stich Sebastian","year":"2018","unstructured":"Stich Sebastian U, Cordonnier J-B, Jaggi M (2018) Sparsified SGD with memory. Adv Neural Inform Process Syst 31:4452\u20134463","journal-title":"Adv Neural Inform Process Syst"},{"key":"4083_CR30","unstructured":"Hanlin T, Chen Y, Xiangru L, Tong Z, and Ji L. Doublesqueeze: Parallel stochastic gradient descent with double-pass error-compensated compression. In Proceedings of the 36th International Conference on Machine Learning, ICML 2019, 9-15 June 2019, Long Beach, California, USA, volume\u00a097 of Proceedings of Machine Learning Research, pages 6155\u20136165. PMLR, 2019"},{"key":"4083_CR31","unstructured":"(2018). In: L Yujun, S Han, M Huizi, W Yu, D William (eds)"},{"key":"4083_CR32","unstructured":"Chia-Yu C, Jungwook C, Daniel B, Ankur A, Wei Z, and Kailash G. Adacomp: Adaptive residual gradient compression for data-parallel distributed training. In Thirty-Second AAAI Conference on Artificial Intelligence, 2018"},{"key":"4083_CR33","doi-asserted-by":"crossref","unstructured":"Seide F, Hao F, Droppo J, Li G, Dong Y (2014) 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech dnns","DOI":"10.21437\/Interspeech.2014-274"},{"key":"4083_CR34","first-page":"1509","volume":"5","author":"W Wei","year":"2017","unstructured":"Wei W, Cong X, Feng Y, Chunpeng W, Yandan W, Yiran C, Hai L (2017) Terngrad: ternary gradients to reduce communication in distributed deep learning. Adv Neural Inf Process Syst 5:1509\u20131519","journal-title":"Adv Neural Inf Process Syst"},{"key":"4083_CR35","unstructured":"Ron B, Itay H, Elad H, and Daniel S. Scalable methods for 8-bit training of neural networks. In S.\u00a0Bengio, H.\u00a0Wallach, H.\u00a0Larochelle, K.\u00a0Grauman, N.\u00a0Cesa-Bianchi, and R.\u00a0Garnett, editors, Advances in Neural Information Processing Systems 31, pages 5145\u20135153. Curran Associates, Inc., 2018"},{"key":"4083_CR36","first-page":"1269","volume":"14","author":"LD Emily","year":"2014","unstructured":"Emily LD, Wojciech Z, Joan B, Yann L, Rob F (2014) Exploiting linear structure within convolutional networks for efficient evaluation. Adv Neural Inf Process Syst 14:1269\u20131277","journal-title":"Adv Neural Inf Process Syst"},{"key":"4083_CR37","unstructured":"Dan A, Li J, Tomioka R (2016) and Milan Vojnovic. Randomized quantization for communication-optimal stochastic gradient descent, Qsgd"},{"key":"4083_CR38","first-page":"165","volume":"13","author":"D Ofer","year":"2012","unstructured":"Ofer D, Ran G-B, Ohad S, Lin X (2012) Optimal distributed online prediction using mini-batches. J Mach Learn Res 13:165\u2013202","journal-title":"J Mach Learn Res"},{"issue":"1\u20132","key":"4083_CR39","doi-asserted-by":"publisher","first-page":"703","DOI":"10.14778\/1920841.1920931","volume":"3","author":"A Smola","year":"2010","unstructured":"Smola A, Narayanamurthy S (2010) An architecture for parallel topic models. Proc VLDB Endow 3(1\u20132):703\u2013710","journal-title":"Proc VLDB Endow"},{"key":"4083_CR40","doi-asserted-by":"crossref","unstructured":"Ahmed A, Aly M, Gonzalez J, Narayanamurthy S, Smola Alexander J (2012) Scalable inference in latent variable models. pp 123\u2013132","DOI":"10.1145\/2124295.2124312"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-04083-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-021-04083-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-04083-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,3,15]],"date-time":"2022-03-15T15:30:19Z","timestamp":1647358219000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-021-04083-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,1]]},"references-count":40,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2022,3]]}},"alternative-id":["4083"],"URL":"https:\/\/doi.org\/10.1007\/s11227-021-04083-x","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2021,10,1]]},"assertion":[{"value":"7 September 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 October 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}