{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T05:10:25Z","timestamp":1772082625091,"version":"3.50.1"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2023,3,19]],"date-time":"2023-03-19T00:00:00Z","timestamp":1679184000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,3,19]],"date-time":"2023-03-19T00:00:00Z","timestamp":1679184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Math. Prog. Comp."],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1007\/s12532-023-00237-5","type":"journal-article","created":{"date-parts":[[2023,3,27]],"date-time":"2023-03-27T01:53:35Z","timestamp":1679882015000},"page":"471-508","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Parallel and distributed asynchronous adaptive stochastic gradient methods"],"prefix":"10.1007","volume":"15","author":[{"given":"Yangyang","family":"Xu","sequence":"first","affiliation":[]},{"given":"Yibo","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Yonggui","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Colin","family":"Sutcher-Shepard","sequence":"additional","affiliation":[]},{"given":"Leopold","family":"Grinberg","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,3,19]]},"reference":[{"key":"237_CR1","unstructured":"Agarwal, A., Duchi, J.C.: Distributed delayed stochastic optimization. In: Advances in Neural Information Processing Systems, pp. 873\u2013881 (2011)"},{"issue":"1","key":"237_CR2","doi-asserted-by":"publisher","first-page":"168","DOI":"10.1109\/TAC.2020.2981035","volume":"66","author":"MS Assran","year":"2020","unstructured":"Assran, M.S., Rabbat, M.G.: Asynchronous gradient push. IEEE Trans. Autom. Control 66(1), 168\u2013183 (2020)","journal-title":"IEEE Trans. Autom. Control"},{"key":"237_CR3","doi-asserted-by":"crossref","unstructured":"B\u00e4ckstr\u00f6m, K., Papatriantafilou, M., Tsigas, P.: Mindthestep-asyncPSGD: adaptive asynchronous parallel stochastic gradient descent. In: 2019 IEEE International Conference on Big Data (Big Data), pp 16\u201325. IEEE (2019)","DOI":"10.1109\/BigData47090.2019.9006054"},{"issue":"1","key":"237_CR4","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/0005-1098(91)90003-K","volume":"27","author":"DP Bertsekas","year":"1991","unstructured":"Bertsekas, D.P., Tsitsiklis, J.N.: Some aspects of parallel and distributed iterative algorithms\u2014a survey. Automatica 27(1), 3\u201321 (1991)","journal-title":"Automatica"},{"issue":"3","key":"237_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1961189.1961199","volume":"2","author":"C-C Chang","year":"2011","unstructured":"Chang, C.-C., Lin, C.-J.: Libsvm: a library for support vector machines. ACM Trans. Intell. Syst. Technol. (TIST) 2(3), 1\u201327 (2011)","journal-title":"ACM Trans. Intell. Syst. Technol. (TIST)"},{"key":"237_CR6","unstructured":"Chen, J., Gu, Q.: Closing the generalization gap of adaptive gradient methods in training deep neural networks. arXiv preprint arXiv:1806.06763v1 (2018)"},{"key":"237_CR7","doi-asserted-by":"crossref","unstructured":"Chen, J., Zhou, D., Tang, Y., Yang, Z., Cao, Y., Gu, Q.: Closing the generalization gap of adaptive gradient methods in training deep neural networks. In: Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence, pp. 3267\u20133275 (2021)","DOI":"10.24963\/ijcai.2020\/452"},{"key":"237_CR8","unstructured":"Chen, X., Liu, S., Sun, R., Hong, M.: On the convergence of a class of Adam-type algorithms for non-convex optimization. In: International Conference on Learning Representations (2019)"},{"key":"237_CR9","unstructured":"Chrabaszcz, P., Loshchilov, I., Hutter, F.: A down sampled variant of imagenet as an alternative to the CIFAR datasets. arXiv preprint arXiv:1707.08819 (2017)"},{"key":"237_CR10","unstructured":"Darlow, L.N., Crowley, E.J., Antoniou, A., Storkey, A.J.: Cinic-10 is not imagenet or cifar-10. arXiv preprint arXiv:1810.03505 (2018)"},{"key":"237_CR11","unstructured":"Dean, J., Corrado, G., Monga, R., Chen, K., Devin, M., Mao, M., Ranzato, M., Senior, A., Tucker, P., Yang, K., Le, Q., Ng, A.: Large scale distributed deep networks. In: Advances in Neural Information Processing Systems, pp. 1223\u20131231 (2012)"},{"key":"237_CR12","first-page":"2121","volume":"12","author":"J Duchi","year":"2011","unstructured":"Duchi, J., Hazan, E., Singer, Y.: Adaptive subgradient methods for online learning and stochastic optimization. J. Mach. Learn. Res. 12, 2121\u20132159 (2011)","journal-title":"J. Mach. Learn. Res."},{"key":"237_CR13","unstructured":"Fang, B., Klabjan, D.: Convergence analyses of online ADAM algorithm in convex setting and two-layer ReLU neural network. arXiv preprint arXiv:1905.09356 (2019)"},{"issue":"12","key":"237_CR14","doi-asserted-by":"publisher","first-page":"3740","DOI":"10.1109\/TAC.2016.2525015","volume":"61","author":"HR Feyzmahdavian","year":"2016","unstructured":"Feyzmahdavian, H.R., Aytekin, A., Johansson, M.: An asynchronous mini-batch algorithm for regularized stochastic optimization. IEEE Trans. Autom. Control 61(12), 3740\u20133754 (2016)","journal-title":"IEEE Trans. Autom. Control"},{"issue":"4","key":"237_CR15","doi-asserted-by":"publisher","first-page":"2341","DOI":"10.1137\/120880811","volume":"23","author":"S Ghadimi","year":"2013","unstructured":"Ghadimi, S., Lan, G.: Stochastic first-and zeroth-order methods for nonconvex stochastic programming. SIAM J. Optim. 23(4), 2341\u20132368 (2013)","journal-title":"SIAM J. Optim."},{"key":"237_CR16","doi-asserted-by":"crossref","unstructured":"Guan, N., Shan, L., Yang, C., Xu, W., Zhang, M.: Delay compensated asynchronous ADAM algorithm for deep neural networks. In: 2017 IEEE International Symposium on Parallel and Distributed Processing with Applications and 2017 IEEE International Conference on Ubiquitous Computing and Communications (ISPA\/IUCC), pp. 852\u2013859. IEEE (2017)","DOI":"10.1109\/ISPA\/IUCC.2017.00130"},{"key":"237_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"237_CR18","unstructured":"Keskar, N.S., Mudigere, D., Nocedal, J., Smelyanskiy, M., Tang, P.T.P.: On large-batch training for deep learning: generalization gap and sharp minima. arXiv preprint arXiv:1609.04836 (2016)"},{"key":"237_CR19","unstructured":"Kingma, D.P., Ba, J.: ADAM: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"237_CR20","unstructured":"Krizhevsky, A., Hinton, G., et\u00a0al.: Learning multiple layers of features from tiny images Computer Science Department, University of Toronto, Tech. Rep. (2009)"},{"issue":"1","key":"237_CR21","first-page":"3140","volume":"19","author":"R Leblond","year":"2018","unstructured":"Leblond, R., Pedregosa, F., Lacoste-Julien, S.: Improved asynchronous parallel optimization analysis for stochastic incremental methods. J. Mach. Learn. Res. 19(1), 3140\u20133207 (2018)","journal-title":"J. Mach. Learn. Res."},{"issue":"11","key":"237_CR22","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"key":"237_CR23","unstructured":"Lian, X., Huang, Y., Li, Y., Liu, J.: Asynchronous parallel stochastic gradient for nonconvex optimization. In: Advances in Neural Information Processing Systems pp. 2737\u20132745 (2015)"},{"key":"237_CR24","unstructured":"Lian, X., Zhang, W., Zhang, C., Liu, J.: Asynchronous decentralized parallel stochastic gradient descent. In: International Conference on Machine Learning, pp. 3043\u20133052 (2018)"},{"key":"237_CR25","unstructured":"Liu, J., Wright, S., R\u00e9, C., Bittorf, V., Sridhar, S.: An asynchronous parallel stochastic coordinate descent algorithm. In: International Conference on Machine Learning, pp. 469\u2013477 (2014)"},{"issue":"1","key":"237_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/1900000062","volume":"9","author":"J Liu","year":"2020","unstructured":"Liu, J., Zhang, C., et al.: Distributed learning systems with first-order methods. Found Trends\u00ae Databases 9(1), 1\u2013100 (2020)","journal-title":"Found Trends\u00ae Databases"},{"key":"237_CR27","unstructured":"Luo, L., Xiong, Y., Liu, Y.: Adaptive gradient methods with dynamic bound of learning rate. In: International Conference on Learning Representations (2019)"},{"issue":"4","key":"237_CR28","doi-asserted-by":"publisher","first-page":"2202","DOI":"10.1137\/16M1057000","volume":"27","author":"H Mania","year":"2017","unstructured":"Mania, H., Pan, X., Papailiopoulos, D., Recht, B., Ramchandran, K., Jordan, M.I.: Perturbed iterate analysis for asynchronous stochastic optimization. SIAM J. Optim. 27(4), 2202\u20132229 (2017)","journal-title":"SIAM J. Optim."},{"key":"237_CR29","unstructured":"Masters, D., Luschi, C.: Revisiting small batch training for deep neural networks. arXiv preprint arXiv:1804.07612 (2018)"},{"key":"237_CR30","doi-asserted-by":"publisher","first-page":"6065","DOI":"10.1109\/TSP.2022.3223214","volume":"70","author":"P Nazari","year":"2022","unstructured":"Nazari, P., Tarzanagh, D.A., Michailidis, G.: Dadam: a consensus-based distributed adaptive gradient method for online optimization. IEEE Trans. Signal Process. 70, 6065\u20136079 (2022)","journal-title":"IEEE Trans. Signal Process."},{"issue":"4","key":"237_CR31","doi-asserted-by":"publisher","first-page":"1574","DOI":"10.1137\/070704277","volume":"19","author":"A Nemirovski","year":"2009","unstructured":"Nemirovski, A., Juditsky, A., Lan, G., Shapiro, A.: Robust stochastic approximation approach to stochastic programming. SIAM J. Optim. 19(4), 1574\u20131609 (2009)","journal-title":"SIAM J. Optim."},{"issue":"5","key":"237_CR32","doi-asserted-by":"publisher","first-page":"A2851","DOI":"10.1137\/15M1024950","volume":"38","author":"Z Peng","year":"2016","unstructured":"Peng, Z., Xu, Y., Yan, M., Yin, W.: ARock: an algorithmic framework for asynchronous parallel coordinate updates. SIAM J. Sci. Comput. 38(5), A2851\u2013A2879 (2016)","journal-title":"SIAM J. Sci. Comput."},{"issue":"1","key":"237_CR33","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1007\/s40305-017-0183-1","volume":"7","author":"Z Peng","year":"2019","unstructured":"Peng, Z., Xu, Y., Yan, M., Yin, W.: On the convergence of asynchronous parallel iteration with unbounded delays. J. Oper. Res. Soc. China 7(1), 5\u201342 (2019)","journal-title":"J. Oper. Res. Soc. China"},{"issue":"4","key":"237_CR34","doi-asserted-by":"publisher","first-page":"838","DOI":"10.1137\/0330046","volume":"30","author":"BT Polyak","year":"1992","unstructured":"Polyak, B.T., Juditsky, A.B.: Acceleration of stochastic approximation by averaging. SIAM J. Control. Optim. 30(4), 838\u2013855 (1992)","journal-title":"SIAM J. Control. Optim."},{"key":"237_CR35","unstructured":"Recht, B., Re, C., Wright, S., Niu, F.: Hogwild: a lock-free approach to parallelizing stochastic gradient descent. In: Advances in Neural Information Processing Systems, pp. 693\u2013701 (2011)"},{"key":"237_CR36","unstructured":"Reddi, S.J., Kale, S., Kumar, S.: On the convergence of ADAM and beyond. In: International Conference on Learning Representations (2018)"},{"issue":"3","key":"237_CR37","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H Robbins","year":"1951","unstructured":"Robbins, H., Monro, S.: A stochastic approximation method. Ann. Math. Stat. 22(3), 400\u2013407 (1951)","journal-title":"Ann. Math. Stat."},{"key":"237_CR38","unstructured":"Springenberg, J.T., Dosovitskiy, A., Brox, T., Riedmiller, M.: Striving for simplicity: the all convolutional net. arXiv preprint arXiv:1412.6806 (2014)"},{"key":"237_CR39","unstructured":"Sra, S., Yu, A.W., Li, M., Smola, A.: Adadelay: delay adaptive distributed stochastic optimization. In: Artificial Intelligence and Statistics, pp. 957\u2013965 (2016)"},{"issue":"2","key":"237_CR40","first-page":"26","volume":"4","author":"T Tieleman","year":"2012","unstructured":"Tieleman, T., Hinton, G.: RMSProp: divide the gradient by a running average of its recent magnitude. COURSERA: Neural Netw. Mach. Learn. 4(2), 26\u201331 (2012)","journal-title":"COURSERA: Neural Netw. Mach. Learn."},{"key":"237_CR41","doi-asserted-by":"publisher","first-page":"61706","DOI":"10.1109\/ACCESS.2019.2916341","volume":"7","author":"PT Tran","year":"2019","unstructured":"Tran, P.T., Phong, L.T.: On the convergence proof of AMSGrad and a new version. IEEE Access 7, 61706\u201361716 (2019)","journal-title":"IEEE Access"},{"key":"237_CR42","unstructured":"Wang, G., Lu, S., Cheng, Q., Tu, W., Zhang, L.: SAdam: a variant of adam for strongly convex functions. In: International Conference on Learning Representations (2020)"},{"key":"237_CR43","unstructured":"Wu, J., Huang, W., Huang, J., Zhang, T.: Error compensated quantized SGD and its applications to large-scale distributed optimization. In: International Conference on Machine Learning, pp. 5325\u20135333 (2018)"},{"key":"237_CR44","doi-asserted-by":"crossref","unstructured":"Yan, Y., Yang, T., Li, Z., Lin, Q., Yang, Y.: A unified analysis of stochastic momentum methods for deep learning. In: Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence, IJCAI-18, pp. 2955\u20132961. International Joint Conferences on Artificial Intelligence Organization, 7 (2018)","DOI":"10.24963\/ijcai.2018\/410"},{"key":"237_CR45","doi-asserted-by":"crossref","unstructured":"Zagoruyko, S., Komodakis, N.: Wide residual networks. In: British Machine Vision Conference 2016. British Machine Vision Association (2016)","DOI":"10.5244\/C.30.87"},{"key":"237_CR46","unstructured":"Zhou, D., Tang, Y., Yang, Z., Cao, Y., Gu, Q.: On the convergence of adaptive gradient methods for nonconvex optimization. arXiv preprint arXiv:1808.05671 (2018)"}],"container-title":["Mathematical Programming Computation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12532-023-00237-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s12532-023-00237-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12532-023-00237-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,24]],"date-time":"2023-07-24T15:17:13Z","timestamp":1690211833000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s12532-023-00237-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3,19]]},"references-count":46,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,9]]}},"alternative-id":["237"],"URL":"https:\/\/doi.org\/10.1007\/s12532-023-00237-5","relation":{},"ISSN":["1867-2949","1867-2957"],"issn-type":[{"value":"1867-2949","type":"print"},{"value":"1867-2957","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,3,19]]},"assertion":[{"value":"21 January 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 February 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 March 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"The full code was made available for review and has been released at .","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}}]}}