{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T00:54:24Z","timestamp":1767142464083,"version":"build-2238731810"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2022,3,3]],"date-time":"2022-03-03T00:00:00Z","timestamp":1646265600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,3,3]],"date-time":"2022-03-03T00:00:00Z","timestamp":1646265600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"name":"the National Key R&D Program of China under contract","award":["2017YFB1002201"],"award-info":[{"award-number":["2017YFB1002201"]}]},{"name":"the National Natural Science Fund for Distinguished Young Scholar","award":["61625204"],"award-info":[{"award-number":["61625204"]}]},{"name":"the Key Program of National Science Foundation of China","award":["61836006"],"award-info":[{"award-number":["61836006"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2022,7]]},"DOI":"10.1007\/s11227-021-04267-5","type":"journal-article","created":{"date-parts":[[2022,3,3]],"date-time":"2022-03-03T09:04:35Z","timestamp":1646298275000},"page":"12410-12433","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["FLSGD: free local SGD with parallel synchronization"],"prefix":"10.1007","volume":"78","author":[{"given":"Qing","family":"Ye","sequence":"first","affiliation":[]},{"given":"Yuhao","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Mingjia","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Jiancheng","family":"Lv","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,3,3]]},"reference":[{"issue":"6","key":"4267_CR1","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2017) Imagenet classification with deep convolutional neural networks. Commun ACM 60(6):84\u201390. https:\/\/doi.org\/10.1145\/3065386","journal-title":"Commun ACM"},{"key":"4267_CR2","unstructured":"Dayiheng L, Yeyun G, Jie F, Yu Y, Jiusheng C, Daxin J, Jiancheng L, Nan D (2020) Rikinet: reading wikipedia pages for natural question answering. ACL. pp 6762\u20136771"},{"issue":"8","key":"4267_CR3","first-page":"1296","volume":"22","author":"CKB Sim","year":"2014","unstructured":"Sim CKB (2014) A spectral masking approach to noise-robust speech recognition using deep neural networks. IEEE Trans Audio Speech Lang Process Publ IEEE Signal Process Soc 22(8):1296\u20131305","journal-title":"IEEE Trans Audio Speech Lang Process Publ IEEE Signal Process Soc"},{"key":"4267_CR4","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), vol 2016. pp 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"4267_CR5","doi-asserted-by":"publisher","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Li FF (2009) Imagenet: a large-scale hierarchical image database. pp 248\u2013255. https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"4267_CR6","unstructured":"Dean J, Corrado GS, Monga R, Chen K, Devin M, Le QV, Mao MZ, Ranzato M, Senior A, Tucker P, Yang K, Ng, AY (2012) Large scale distributed deep networks. In: Proceedings of the 25th International Conference on Neural Information Processing Systems, vol 1. NIPS\u201912, Curran Associates Inc., Red Hook, NY, USA, pp 1223\u20131231"},{"key":"4267_CR7","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y Lecun","year":"2015","unstructured":"Lecun Y, Bengio Y, Hinton G (2015) Deep learning. Nature 521:436\u2013444. https:\/\/doi.org\/10.1038\/nature14539","journal-title":"Nature"},{"key":"4267_CR8","doi-asserted-by":"publisher","unstructured":"Zhang H, Hsieh C-J, Akella V (2016) Hogwild++: A new mechanism for decentralized asynchronous stochastic gradient descent. In: 2016 IEEE 16th International Conference on Data Mining (ICDM). pp 629\u2013638. https:\/\/doi.org\/10.1109\/ICDM.2016.0074","DOI":"10.1109\/ICDM.2016.0074"},{"key":"4267_CR9","first-page":"165","volume":"13","author":"O Dekel","year":"2012","unstructured":"Dekel O, Gilad-Bachrach R, Shamir O, Xiao L (2012) Optimal distributed online prediction using mini-batches. J Mach Learn Res 13:165\u2013202","journal-title":"J Mach Learn Res"},{"key":"4267_CR10","doi-asserted-by":"publisher","unstructured":"Zhao X, An A, Liu J, Chen BX (2019) Dynamic stale synchronous parallel distributed training for deep learning. https:\/\/doi.org\/10.1109\/ICDCS.2019.00149","DOI":"10.1109\/ICDCS.2019.00149"},{"key":"4267_CR11","doi-asserted-by":"publisher","unstructured":"Yu H, Yang S, Zhu S (2019) Parallel restarted sgd with faster convergence and less communication: demystifying why model averaging works for deep learning. In: Proceedings of the AAAI Conference on Artificial Intelligence vol 33, pp 5693\u20135700. https:\/\/doi.org\/10.1609\/aaai.v33i01.33015693","DOI":"10.1609\/aaai.v33i01.33015693"},{"key":"4267_CR12","unstructured":"Stich SU (2019) Local SGD converges fast and communicates little. In: International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=S1g2JnRcFX"},{"key":"4267_CR13","unstructured":"Zhang J, Sa CD, Mitliagkas I, R\u00e9 C (2021) Parallel SGD: when does averaging help?. CoRR. arXiv:1606.07365"},{"key":"4267_CR14","unstructured":"Wang J, Joshi G (2019) Adaptive communication strategies to achieve the best error-runtime trade-off in local-update SGD. In: Talwalkar A, Smith V, Zaharia M (eds) Proceedings of Machine Learning and Systems, 2019, MLSys 2019, Stanford, CA, USA, March 31\u2013April 2, 2019, mlsys.org, 2019"},{"key":"4267_CR15","unstructured":"Zhang S, Choromanska A, LeCun Y (2015) Deep learning with elastic averaging SGD. In: Cortes C, Lawrence ND, Lee DD, Sugiyama M, Garnett R (eds) Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7\u201312, 2015, Montreal, Quebec, Canada. pp 685\u2013693"},{"key":"4267_CR16","doi-asserted-by":"crossref","unstructured":"Zhao X, Papagelis M, An A, Chen BX, Liu J, Hu Y (2019) Elastic bulk synchronous parallel model for distributed deep learning. In: Wang J, Shim K, Wu X (eds) 2019 IEEE International Conference on Data Mining. ICDM 2019, Beijing, China, November 8\u201311, 2019, IEEE, pp 1504\u20131509","DOI":"10.1109\/ICDM.2019.00198"},{"key":"4267_CR17","unstructured":"Lian X, Huang Y, Li Y, Liu J (2015) Asynchronous parallel stochastic gradient for nonconvex optimization. In: Proceedings of the 28th International Conference on Neural Information Processing Systems, vol 2, NIPS\u201915. MIT Press, Cambridge, pp 2737\u20132745"},{"key":"4267_CR18","doi-asserted-by":"crossref","unstructured":"Seide F, Fu H, Droppo J, Li G, Yu D (2014) 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs. pp 1058\u20131062","DOI":"10.21437\/Interspeech.2014-274"},{"key":"4267_CR19","unstructured":"Alistarh D, Grubic D, Li J, Tomioka R, Vojnovic M (2017) QSGD: communication-efficient SGD via gradient quantization and encoding. In: Guyon I, von Luxburg U, Bengio S, Wallach HM, Fergus R, Vishwanathan SVN, Garnett R (eds) Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4\u20139, 2017, Long Beach, CA, USA. pp 1709\u20131720"},{"key":"4267_CR20","doi-asserted-by":"crossref","unstructured":"Aji AF, Heafield K (2017) Sparse communication for distributed gradient descent. In: Palmer M, Hwa R, Riedel S (eds) Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, EMNLP 2017, Copenhagen, Denmark, September 9\u201311, 2017. Association for Computational Linguistics, pp 440\u2013445","DOI":"10.18653\/v1\/D17-1045"},{"key":"4267_CR21","doi-asserted-by":"crossref","unstructured":"Shi S, Wang Q, Zhao K, Tang Z, Wang Y, Huang X, Chu X (2019) A distributed synchronous SGD algorithm with global top-k sparsification for low bandwidth networks. In: 2019 IEEE 39th International Conference on Distributed Computing Systems (ICDCS), pp 2238\u20132247","DOI":"10.1109\/ICDCS.2019.00220"},{"key":"4267_CR22","unstructured":"Vogels T, Karimireddy SP, Jaggi M (2020) Practical low-rank communication compression in decentralized deep learning. In: Larochelle H, Ranzato M, Hadsell R, Balcan M, Lin H (eds) Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6\u201312, 2020, virtual"},{"key":"4267_CR23","unstructured":"Zhang H, Zheng Z, Xu S, Dai W, Ho Q, Liang X, Hu Z, Wei J, Xie P, Xing EP (2017) Poseidon: an efficient communication architecture for distributed deep learning on GPU clusters. In: USENIX Annual Technical Conference (USENIX ATC 17). USENIX Association, Santa Clara, CA, pp 181\u2013193"},{"issue":"9","key":"4267_CR24","doi-asserted-by":"publisher","first-page":"2144","DOI":"10.1109\/TPDS.2021.3062721","volume":"32","author":"S Wang","year":"2021","unstructured":"Wang S, Pi A, Zhou X, Wang J, Xu CZ (2021) Overlapping communication with computation in parameter server for scalable dl training. IEEE Trans Parallel Distrib Syst 32(9):2144\u20132159","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"4267_CR25","unstructured":"Li Y, Yu M, Li S, Avestimehr S, Kim NS, Schwing A (2018) Pipe-SGD: A decentralized pipelined SGD framework for distributed deep net training. In: Proceedings of the 32nd International Conference on Neural Information Processing Systems, NIPS\u201918. Curran Associates Inc., Red Hook, NY, USA, pp 8056\u20138067"},{"key":"4267_CR26","unstructured":"Lin T, Stich SU, Patel KK, Jaggi M (2020) Don\u2019t use large mini-batches, use local SGD. In: International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=B1eyO1BFPr"},{"key":"4267_CR27","unstructured":"Ye Q, Zhou Y, Shi M, Sun Y, Lv J (2020) DBS: Dynamic batch size for distributed deep neural network training. arXiv e-prints arXiv:2007.11831"},{"key":"4267_CR28","unstructured":"Zinkevich M, Weimer M, Smola AJ, Li L (2010) Parallelized stochastic gradient descent. In: Lafferty JD, Williams CKI, Shawe-Taylor J, Zemel RS, Culotta A (eds) Advances in Neural Information Processing Systems 23: 24th Annual Conference on Neural Information Processing Systems 2010, vol 2010. Proceedings of a Meeting held 6\u20139 December. Vancouver, British Columbia, Canada, Curran Associates Inc, pp 2595\u20132603"},{"key":"4267_CR29","unstructured":"Woodworth B, Kshitij Patel K, Stich SU, Dai Z, Bullins B, McMahan HB, Shamir O, Srebro N (2020) Is local SGD better than minibatch SGD?. arXiv e-prints arXiv:2002.07839"},{"key":"4267_CR30","unstructured":"Lin T, Stich SU, Patel KK, Jaggi M (2020) Don\u2019t use large mini-batches, use local SGD. In: 8th International Conference on Learning Representations, ICLR 2020. Addis Ababa, Ethiopia, April 26\u201330, 2020, OpenReview.net"},{"key":"4267_CR31","unstructured":"Kshitij Patel K, Dieuleveut A (2019) Communication trade-offs for synchronized distributed SGD with large step size. arXiv e-prints arXiv:1904.11325"},{"key":"4267_CR32","unstructured":"Jiang P, Agrawal G (2020) Adaptive periodic averaging: a practical approach to reducing communication in distributed learning. arXiv e-prints arXiv:2007.06134"},{"key":"4267_CR33","doi-asserted-by":"publisher","unstructured":"Ko Y, Choi K, Seo J, Kim S-W (2021) An in-depth analysis of distributed training of deep neural networks. In: IEEE International Parallel and Distributed Processing Symposium (IPDPS), vol 2021, pp 994\u20131003. https:\/\/doi.org\/10.1109\/IPDPS49936.2021.00108","DOI":"10.1109\/IPDPS49936.2021.00108"},{"key":"4267_CR34","doi-asserted-by":"publisher","unstructured":"Gupta S, Zhang W, Wang F (2016) Model accuracy and runtime tradeoff in distributed deep learning: a systematic study. In: Bonchi F, Domingo-Ferrer J, Baeza-Yates R, Zhou Z, Wu X (eds) IEEE 16th International Conference on Data Mining, ICDM 2016, December 12\u201315, 2016, Barcelona, Spain. IEEE Computer Society, pp 171\u2013180. https:\/\/doi.org\/10.1109\/ICDM.2016.0028","DOI":"10.1109\/ICDM.2016.0028"},{"key":"4267_CR35","unstructured":"Ho Q, Cipar J, Cui H, Lee S, Kim JK, Gibbons PB, Gibson GA, Ganger G, Xing EP (2013) More effective distributed ml via a stale synchronous parallel parameter server. In: Advances in Neural Information Processing Systems. pp 1223\u20131231"},{"key":"4267_CR36","unstructured":"Cipar J, Ho Q, Kim JK, Lee S, Ganger GR, Gibson G, Keeton K, Xing E (2013) Solving the straggler problem with bounded staleness. In: Presented as Part of the 14th Workshop on Hot Topics in Operating Systems"},{"key":"4267_CR37","unstructured":"Zhang W, Gupta S, Lian X, Liu J. Staleness-aware async-SGD for distributed deep learning. arXiv preprint arXiv:1511.05950"},{"key":"4267_CR38","unstructured":"Lian X, Zhang W, Zhang C, Liu J (2018) Asynchronous decentralized parallel stochastic gradient descent. In: Dy J, Krause A (eds) Proceedings of the 35th International Conference on Machine Learning, vol\u00a080 of Proceedings of Machine Learning Research, PMLR. pp 3043\u20133052"},{"key":"4267_CR39","unstructured":"Suresh AT, Yu FX, Kumar S, McMahan HB (2017) Distributed mean estimation with limited communication. In: Proceedings of the 34th International Conference on Machine Learning, vol 70, ICML\u201917, JMLR.org. p 3329\u20133337"},{"key":"4267_CR40","unstructured":"Wen W, Xu C, Yan F, Wu C, Wang Y, Chen Y, Li H (2017) Terngrad: ternary gradients to reduce communication in distributed deep learning. In: Guyon I, Luxburg UV, Bengio S, Wallach H, Fergus R, Vishwanathan S, Garnett R (eds) Advances in Neural Information Processing Systems 30. Curran Associates Inc, pp 1509\u20131519"},{"key":"4267_CR41","unstructured":"Tang H, Yu C, Lian X, Zhang T, Liu J (2019) DoubleSqueeze: Parallel stochastic gradient descent with double-pass error-compensated compression. In: Chaudhuri K, Salakhutdinov R (eds) Proceedings of the 36th International Conference on Machine Learning, vol\u00a097 of Proceedings of Machine Learning Research, PMLR. pp 6155\u20136165"},{"key":"4267_CR42","doi-asserted-by":"crossref","unstructured":"Sattler F, Wiedemann S, M\u00fcller K, Samek W (2019) Sparse binary compression: towards distributed deep learning with minimal communication. In: 2019 International Joint Conference on Neural Networks (IJCNN). pp 1\u20138","DOI":"10.1109\/IJCNN.2019.8852172"},{"key":"4267_CR43","unstructured":"Wang H, Sievert S, Liu S, Charles ZB, Papailiopoulos DS, Wright S (2018) ATOMO: communication-efficient learning via atomic sparsification. In: Bengio S, Wallach HM, Larochelle H, Grauman K, Cesa-Bianchi N, Garnett R (eds) Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3\u20138, 2018, Montr\u00e9al, Canada. pp 9872\u20139883"},{"key":"4267_CR44","unstructured":"Vogels T, Karimireddy SP, Jaggi M (2019) Powersgd: practical low-rank gradient compression for distributed optimization. In: Wallach HM, Larochelle H, Beygelzimer A, d\u2019Alch\u00e9-Buc F, Fox EB, Garnett R (eds) Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8\u201314, 2019. Vancouver, BC, Canada, pp 14236\u201314245"},{"key":"4267_CR45","unstructured":"Alistarh D, Hoefler T, Johansson M, Khirirat S, Konstantinov N, Renggli C (2018) The convergence of sparsified gradient methods. In: Proceedings of the 32nd International Conference on Neural Information Processing Systems, NIPS\u201918. Curran Associates Inc., Red Hook, NY, USA, pp 5977\u20135987"},{"issue":"9","key":"4267_CR46","doi-asserted-by":"publisher","first-page":"2144","DOI":"10.1109\/TPDS.2021.3062721","volume":"32","author":"S Wang","year":"2021","unstructured":"Wang S, Pi A, Zhou X, Wang J, Xu C-Z (2021) Overlapping communication with computation in parameter server for scalable dl training. IEEE Trans Parallel Distrib Syst 32(9):2144\u20132159. https:\/\/doi.org\/10.1109\/TPDS.2021.3062721","journal-title":"IEEE Trans Parallel Distrib Syst"},{"issue":"8","key":"4267_CR47","doi-asserted-by":"publisher","first-page":"1903","DOI":"10.1109\/TPDS.2021.3052862","volume":"32","author":"S Shi","year":"2021","unstructured":"Shi S, Chu X, Li B (2021) Mg-wfbp: Merging gradients wisely for efficient communication in distributed deep learning. IEEE Trans Parallel Distrib Syst 32(8):1903\u20131917. https:\/\/doi.org\/10.1109\/TPDS.2021.3052862","journal-title":"IEEE Trans Parallel Distrib Syst"},{"key":"4267_CR48","unstructured":"Zheng S, Meng Q, Wang T, Chen W, Yu N, Ma Z-M, Liu T-Y (2017) Asynchronous stochastic gradient descent with delay compensation. In: International Conference on Machine Learning, PMLR. pp 4120\u20134129"},{"key":"4267_CR49","series-title":"Springer series in statistics","volume-title":"The elements of statistical learning","author":"J Friedman","year":"2001","unstructured":"Friedman J, Hastie T, Tibshirani R et al (2001) The elements of statistical learning, vol 1. Springer series in statistics. Springer, New York"},{"key":"4267_CR50","unstructured":"Krizhevsky A, Hinton G. Learning multiple layers of features from tiny images. Computer Science Department, University of Toronto, Tech. Rep 1"},{"key":"4267_CR51","doi-asserted-by":"publisher","unstructured":"Zhao H, Canny J (2014) Kylix: A sparse allreduce for commodity clusters. In: 2014 43rd International Conference on Parallel Processing. pp 273\u2013282. https:\/\/doi.org\/10.1109\/ICPP.2014.36","DOI":"10.1109\/ICPP.2014.36"},{"key":"4267_CR52","doi-asserted-by":"publisher","unstructured":"Huang G, Liu Z, Van Der Maaten L, Weinberger KQ (2017) Densely connected convolutional networks. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), vol 2017. pp 2261\u20132269. https:\/\/doi.org\/10.1109\/CVPR.2017.243","DOI":"10.1109\/CVPR.2017.243"}],"updated-by":[{"DOI":"10.1007\/s11227-022-04440-4","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2022,3,29]],"date-time":"2022-03-29T00:00:00Z","timestamp":1648512000000}}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-04267-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-021-04267-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-021-04267-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,9]],"date-time":"2022-06-09T20:09:43Z","timestamp":1654805383000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-021-04267-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,3]]},"references-count":52,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2022,7]]}},"alternative-id":["4267"],"URL":"https:\/\/doi.org\/10.1007\/s11227-021-04267-5","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,3,3]]},"assertion":[{"value":"21 December 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 March 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 March 2022","order":3,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Correction","order":4,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"A Correction to this paper has been published:","order":5,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"https:\/\/doi.org\/10.1007\/s11227-022-04440-4","URL":"https:\/\/doi.org\/10.1007\/s11227-022-04440-4","order":6,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}}]}}