{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,6]],"date-time":"2025-10-06T17:50:19Z","timestamp":1759773019821,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,6,21]],"date-time":"2023-06-21T00:00:00Z","timestamp":1687305600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,6,21]],"date-time":"2023-06-21T00:00:00Z","timestamp":1687305600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["11901338"],"award-info":[{"award-number":["11901338"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100019339","name":"Tsinghua Initiative Scientific Research Program","doi-asserted-by":"publisher","award":["11771243"],"award-info":[{"award-number":["11771243"]}],"id":[{"id":"10.13039\/501100019339","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Optim Lett"],"published-print":{"date-parts":[[2024,5]]},"DOI":"10.1007\/s11590-023-02026-4","type":"journal-article","created":{"date-parts":[[2023,6,21]],"date-time":"2023-06-21T04:14:23Z","timestamp":1687320863000},"page":"909-923","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Convergence rates of training deep neural networks via alternating minimization methods"],"prefix":"10.1007","volume":"18","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4488-6782","authenticated-orcid":false,"given":"Jintao","family":"Xu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenglong","family":"Bao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenxun","family":"Xing","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,6,21]]},"reference":[{"key":"2026_CR1","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1007\/s10107-007-0133-5","volume":"116","author":"H Attouch","year":"2009","unstructured":"Attouch, H., Bolte, J.: On the convergence of the proximal algorithm for nonsmooth functions involving analytic features. Math. Program. 116, 5\u201316 (2009). https:\/\/doi.org\/10.1007\/s10107-007-0133-5","journal-title":"Math. Program."},{"issue":"2","key":"2026_CR2","doi-asserted-by":"publisher","first-page":"438","DOI":"10.1287\/moor.1100.0449","volume":"35","author":"H Attouch","year":"2010","unstructured":"Attouch, H., Bolte, J., Redont, P., Soubeyran, A.: Proximal alternating minimization and projection methods for nonconvex problems: an approach based on the Kurdyka-\u0141ojasiewicz inequality. Math. Oper. Res. 35(2), 438\u2013457 (2010). https:\/\/doi.org\/10.1287\/moor.1100.0449","journal-title":"Math. Oper. Res."},{"key":"2026_CR3","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1007\/s10107-011-0484-9","volume":"137","author":"H Attouch","year":"2013","unstructured":"Attouch, H., Bolte, J., Svaiter, B.F.: Convergence of descent methods for semi-algebraic and tame problems: proximal algorithms, forward-backward splitting, and regularized Gauss-Seidel methods. Math. Program. 137, 91\u2013129 (2013). https:\/\/doi.org\/10.1007\/s10107-011-0484-9","journal-title":"Math. Program."},{"issue":"2","key":"2026_CR4","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1109\/72.279181","volume":"5","author":"Y Bengio","year":"1994","unstructured":"Bengio, Y., Simard, P., Frasconi, P.: Learning long-term dependencies with gradient descent is difficult. IEEE Trans. Neural Netw. 5(2), 157\u2013166 (1994). https:\/\/doi.org\/10.1109\/72.279181","journal-title":"IEEE Trans. Neural Netw."},{"issue":"4","key":"2026_CR5","doi-asserted-by":"publisher","first-page":"1205","DOI":"10.1137\/050644641","volume":"17","author":"J Bolte","year":"2007","unstructured":"Bolte, J., Daniilidis, A., Lewis, A.: The \u0141ojasiewicz inequality for nonsmooth subanalytic functions with applications to subgradient dynamical systems. SIAM J. Opt. 17(4), 1205\u20131223 (2007). https:\/\/doi.org\/10.1137\/050644641","journal-title":"SIAM J. Opt."},{"key":"2026_CR6","doi-asserted-by":"publisher","first-page":"459","DOI":"10.1007\/s10107-013-0701-9","volume":"146","author":"J Bolte","year":"2014","unstructured":"Bolte, J., Sabach, S., Teboulle, M.: Proximal alternating linearized minimization for nonconvex and nonsmooth problems. Math. Program. 146, 459\u2013494 (2014). https:\/\/doi.org\/10.1007\/s10107-013-0701-9","journal-title":"Math. Program."},{"issue":"1","key":"2026_CR7","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/2200000016","volume":"3","author":"S Boyd","year":"2011","unstructured":"Boyd, S., Parikh, N., Chu, E., Peleato, B., Eckstein, J.: Distributed optimization and statistical learning via the alternating direction method of multipliers. Found. Trends Mach. Learn. 3(1), 1\u2013122 (2011). https:\/\/doi.org\/10.1561\/2200000016","journal-title":"Found. Trends Mach. Learn."},{"key":"2026_CR8","unstructured":"Carreira-Perpi\u00f1\u00e1n, M.\u00c1., Wang, W.: Distributed optimization of deeply nested systems. In: Kaski, S., Corander, J. (eds.) Proceedings of the Seventeenth International Conference on Artificial Intelligence and Statistics, Proceedings of Machine Learning Research, vol.\u00a033, pp. 10\u201319. PMLR (2014)"},{"key":"2026_CR9","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 801\u2013818 (2018)","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"2026_CR10","doi-asserted-by":"publisher","unstructured":"Cheng, H.T., Koc, L., Harmsen, J., Shaked, T., Chandra, T., Aradhye, H., Anderson, G., Corrado, G., Chai, W., Ispir, M., Anil, R., Haque, Z., Hong, L., Jain, V., Liu, X., Shah, H.: Wide & deep learning for recommender systems. In: Proceedings of the 1st Workshop on Deep Learning for Recommender Systems, pp. 7\u201310 (2016). https:\/\/doi.org\/10.1145\/2988450.2988454","DOI":"10.1145\/2988450.2988454"},{"key":"2026_CR11","unstructured":"Choromanska, A., Cowen, B., Kumaravel, S., Luss, R., Rigotti, M., Rish, I., Kingsbury, B., DiAchille, P., Gurev, V., Tejwani, R., Bouneffouf, D.: Beyond backprop: Online alternating minimization with auxiliary variables. In: Chaudhuri, K., Salakhutdinov, R. (eds.) Proceedings of the 36th International Conference on Machine Learning, Proceedings of Machine Learning Research, vol.\u00a097, pp. 1193\u20131202. PMLR (2019). URL https:\/\/proceedings.mlr.press\/v97\/choromanska19a.html"},{"key":"2026_CR12","doi-asserted-by":"publisher","unstructured":"Covington, P., Adams, J., Sargin, E.: Deep neural networks for YouTube recommendations. In: Proceedings of the 10th ACM Conference on Recommender Systems, pp. 191\u2013198 (2016). https:\/\/doi.org\/10.1145\/2959100.2959190","DOI":"10.1145\/2959100.2959190"},{"key":"2026_CR13","doi-asserted-by":"publisher","first-page":"874","DOI":"10.1007\/s10957-014-0642-3","volume":"165","author":"P Frankel","year":"2015","unstructured":"Frankel, P., Garrigos, G., Peypouquet, J.: Splitting methods with variable metric for Kurdyka-\u0141ojasiewicz functions and general convergence rates. J. Opt. Theory Appl. 165, 874\u2013900 (2015). https:\/\/doi.org\/10.1007\/s10957-014-0642-3","journal-title":"J. Opt. Theory Appl."},{"key":"2026_CR14","unstructured":"Goodfellow, I., Bengio, Y., Courville, A.: Deep Learning. MIT Press (2016). URL http:\/\/www.deeplearningbook.org"},{"key":"2026_CR15","unstructured":"Gu, F., Askari, A., Ghaoui, L.E.: Fenchel lifted networks: A Lagrange relaxation of neural network training. In: S.\u00a0Chiappa, R.\u00a0Calandra (eds.) Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics, Proceedings of Machine Learning Research, vol. 108, pp. 3362\u20133371. PMLR (2020). URL https:\/\/proceedings.mlr.press\/v108\/gu20a.html"},{"key":"2026_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2026_CR17","doi-asserted-by":"publisher","unstructured":"Jagatap, G., Hegde, C.: Learning ReLU networks via alternating minimization. arXiv preprint arXiv: 1806.07863 (2018). https:\/\/doi.org\/10.48550\/arXiv.1806.07863","DOI":"10.48550\/arXiv.1806.07863"},{"key":"2026_CR18","doi-asserted-by":"publisher","unstructured":"Kiaee, F., Gagn\u00e9, C., Abbasi, M.: Alternating direction method of multipliers for sparse convolutional neural networks. arXiv preprint arXiv: 1611.01590 (2016). https:\/\/doi.org\/10.48550\/arXiv.1611.01590","DOI":"10.48550\/arXiv.1611.01590"},{"key":"2026_CR19","doi-asserted-by":"publisher","unstructured":"Krantz, S.G., Parks, H.R.: A Primer of Real Analytic Functions, 2 edn. Birkh\u00e4user, Boston (2002). https:\/\/doi.org\/10.1007\/978-0-8176-8134-0","DOI":"10.1007\/978-0-8176-8134-0"},{"issue":"3","key":"2026_CR20","doi-asserted-by":"publisher","first-page":"769","DOI":"10.5802\/aif.1638","volume":"48","author":"K Kurdyka","year":"1998","unstructured":"Kurdyka, K.: On gradients of functions definable in o-minimal structures. Ann. Inst. Fourier 48(3), 769\u2013783 (1998). https:\/\/doi.org\/10.5802\/aif.1638","journal-title":"Ann. Inst. Fourier"},{"key":"2026_CR21","doi-asserted-by":"publisher","unstructured":"Lau, T.T.K., Zeng, J., Wu, B., Yao, Y.: A proximal block coordinate descent algorithm for deep neural network training. arXiv preprint arXiv:1803.09082 (2018). https:\/\/doi.org\/10.48550\/arXiv.1803.09082","DOI":"10.48550\/arXiv.1803.09082"},{"key":"2026_CR22","doi-asserted-by":"publisher","first-page":"371","DOI":"10.1007\/s10107-015-0963-5","volume":"159","author":"G Li","year":"2016","unstructured":"Li, G., Pong, T.K.: Douglas-Rachford splitting for nonconvex optimization with application to nonconvex feasibility problems. Math. Program. 159, 371\u2013401 (2016). https:\/\/doi.org\/10.1007\/s10107-015-0963-5","journal-title":"Math. Program."},{"key":"2026_CR23","doi-asserted-by":"publisher","first-page":"1199","DOI":"10.1007\/s10208-017-9366-8","volume":"18","author":"G Li","year":"2018","unstructured":"Li, G., Pong, T.K.: Calculus of the exponent of Kurdyka-\u0141ojasiewicz inequality and its applications to linear convergence of first-order methods. Found. Comput. Math. 18, 1199\u20131232 (2018). https:\/\/doi.org\/10.1007\/s10208-017-9366-8","journal-title":"Found. Comput. Math."},{"key":"2026_CR24","doi-asserted-by":"publisher","unstructured":"Li, X., Milzarek, A., Qiu, J.: Convergence of random reshuffling under the Kurdyka-\u0141ojasiewicz inequality. arXiv preprint arXiv: 2110.04926 (2021). https:\/\/doi.org\/10.48550\/arXiv.2110.04926","DOI":"10.48550\/arXiv.2110.04926"},{"key":"2026_CR25","unstructured":"\u0141ojasiewicz, S.: Une propri\u00e9t\u00e9 topologique des sous-ensembles analytiques r\u00e9els. In: Les \u00c9quations aux D\u00e9riv\u00e9es Partielles, pp. 87\u201389. \u00c9ditions du Centre National de la Recherche Scientifique, Paris (1963)"},{"key":"2026_CR26","unstructured":"\u0141ojasiewicz, S.: Sur les trajectoires du gradient d\u2019une fonction analytique. In: Seminari di Geometria 1982-1983, pp. 115\u2013117. Dipartimento di Matematica, Universit\u00e0 di Bologna, Bologna (1984)"},{"issue":"5","key":"2026_CR27","doi-asserted-by":"publisher","first-page":"1575","DOI":"10.5802\/aif.1384","volume":"43","author":"S \u0141ojasiewicz","year":"1993","unstructured":"\u0141ojasiewicz, S.: Sur la g\u00e9om\u00e9trie semi- et sous- analytique. Ann. Inst. Fourier 43(5), 1575\u20131595 (1993). https:\/\/doi.org\/10.5802\/aif.1384","journal-title":"Ann. Inst. Fourier"},{"key":"2026_CR28","doi-asserted-by":"publisher","unstructured":"Luo, Z.Q., Pang, J.S., Ralph, D.: Mathematical Programs with Equilibrium Constraints. Cambridge University Press, Cambridge (1996). https:\/\/doi.org\/10.1017\/CBO9780511983658","DOI":"10.1017\/CBO9780511983658"},{"key":"2026_CR29","doi-asserted-by":"publisher","unstructured":"Mordukhovich, B.S.: Variational Analysis and Generalized Differentiation I: Basic Theory. Springer-Verlag, Berlin, Heidelberg (2006). https:\/\/doi.org\/10.1007\/3-540-31247-1","DOI":"10.1007\/3-540-31247-1"},{"key":"2026_CR30","doi-asserted-by":"publisher","unstructured":"Rockafellar, R.T., Wets, R.J.B.: Variational Analysis. Springer-Verlag, Berlin, Heidelberg (1998). https:\/\/doi.org\/10.1007\/978-3-642-02431-3","DOI":"10.1007\/978-3-642-02431-3"},{"key":"2026_CR31","doi-asserted-by":"publisher","unstructured":"Shi, H.J.M., Tu, S., Xu, Y., Yin, W.: A primer on coordinate descent algorithms. arXiv preprint arXiv:1610.00040 (2016). https:\/\/doi.org\/10.48550\/arXiv.1610.00040","DOI":"10.48550\/arXiv.1610.00040"},{"key":"2026_CR32","doi-asserted-by":"publisher","unstructured":"Sun, W., Yuan, Y.X.: Optimization Theory and Methods: Nonlinear Programming, vol.\u00a01, chap.\u00a01, pp. 1\u201370. Springer, New York, NY (2006). https:\/\/doi.org\/10.1007\/0-387-24976-1_1","DOI":"10.1007\/0-387-24976-1_1"},{"key":"2026_CR33","volume-title":"Advances in Neural Information Processing Systems","author":"I Sutskever","year":"2014","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Ghahramani, Z., Welling, M., Cortes, C., Lawrence, N., Weinberger, K.Q. (eds.) Advances in Neural Information Processing Systems, vol. 27. Curran Associates Inc (2014)"},{"key":"2026_CR34","unstructured":"Taylor, G., Burmeister, R., Xu, Z., Singh, B., Patel, A., Goldstein, T.: Training neural networks without gradients: a scalable ADMM approach. In: Balcan, M.F., Weinberger, K.Q. (eds.) Proceedings of the 33rd International Conference on Machine Learning, Proceedings of Machine Learning Research, vol.\u00a048, pp. 2722\u20132731. PMLR (2016)"},{"key":"2026_CR35","volume-title":"Advances in Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141, Polosukhin, I.: Attention is all you need. In: Guyon, I., Luxburg, U.V., Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 30. Curran Associates, Inc. (2017)"},{"key":"2026_CR36","doi-asserted-by":"publisher","unstructured":"Wang, J., Chai, Z., Cheng, Y., Zhao, L.: Toward model parallelism for deep neural network based on gradient-free ADMM framework. In: 2020 IEEE International Conference on Data Mining (ICDM), pp. 591\u2013600 (2020). https:\/\/doi.org\/10.1109\/ICDM50108.2020.00068","DOI":"10.1109\/ICDM50108.2020.00068"},{"key":"2026_CR37","doi-asserted-by":"publisher","first-page":"130","DOI":"10.1016\/j.neucom.2022.02.039","volume":"487","author":"J Wang","year":"2022","unstructured":"Wang, J., Li, H., Zhao, L.: Accelerated gradient-free neural network training by multi-convex alternating optimization. Neurocomputing 487, 130\u2013143 (2022). https:\/\/doi.org\/10.1016\/j.neucom.2022.02.039","journal-title":"Neurocomputing"},{"key":"2026_CR38","doi-asserted-by":"publisher","unstructured":"Wang, J., Yu, F., Chen, X., Zhao, L.: ADMM for efficient deep learning with global convergence. In: Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, KDD\u2019 19, p. 111-119. Association for Computing Machinery, New York, NY, USA (2019). https:\/\/doi.org\/10.1145\/3292500.3330936","DOI":"10.1145\/3292500.3330936"},{"issue":"3","key":"2026_CR39","doi-asserted-by":"publisher","first-page":"1758","DOI":"10.1137\/120887795","volume":"6","author":"Y Xu","year":"2013","unstructured":"Xu, Y., Yin, W.: A block coordinate descent method for regularized multiconvex optimization with applications to nonnegative tensor factorization and completion. SIAM J. Imaging Sci. 6(3), 1758\u20131789 (2013). https:\/\/doi.org\/10.1137\/120887795","journal-title":"SIAM J. Imaging Sci."},{"key":"2026_CR40","doi-asserted-by":"publisher","unstructured":"Yu, P., Li, G., Pong, T.K.: Kurdyka-\u0141ojasiewicz exponent via inf-projection. Found. Comput. Math. (2021). https:\/\/doi.org\/10.1007\/s10208-021-09528-6","DOI":"10.1007\/s10208-021-09528-6"},{"key":"2026_CR41","unstructured":"Zeng, J., Lau, T.T.K., Lin, S.B., Yao, Y.: Global convergence of block coordinate descent in deep learning. In: Chaudhuri, K., Salakhutdinov, R. (eds.) Proceedings of the 36th International Conference on Machine Learning, Proceedings of Machine Learning Research, vol.\u00a097, pp. 7313\u20137323. PMLR (2019)"},{"issue":"199","key":"2026_CR42","first-page":"1","volume":"22","author":"J Zeng","year":"2021","unstructured":"Zeng, J., Lin, S.B., Yao, Y., Zhou, D.X.: On ADMM in deep learning: convergence and saturation-avoidance. J. Mach. Learn. Res. 22(199), 1\u201367 (2021)","journal-title":"J. Mach. Learn. Res."},{"key":"2026_CR43","unstructured":"Zhang, Z., Brand, M.: Convergent block coordinate descent for training Tikhonov regularized deep neural networks. In: Guyon, I., Luxburg, U.V, Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., Garnett, R. (eds.) Advances in Neural Information Processing Systems. Curran Associates, Inc. (2017)"},{"key":"2026_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Chen, Y., Saligrama, V.: Efficient training of very deep neural networks for supervised hashing. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1487\u20131495 (2016)","DOI":"10.1109\/CVPR.2016.165"}],"container-title":["Optimization Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11590-023-02026-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11590-023-02026-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11590-023-02026-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,12]],"date-time":"2024-04-12T03:31:01Z","timestamp":1712892661000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11590-023-02026-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,21]]},"references-count":44,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,5]]}},"alternative-id":["2026"],"URL":"https:\/\/doi.org\/10.1007\/s11590-023-02026-4","relation":{},"ISSN":["1862-4472","1862-4480"],"issn-type":[{"type":"print","value":"1862-4472"},{"type":"electronic","value":"1862-4480"}],"subject":[],"published":{"date-parts":[[2023,6,21]]},"assertion":[{"value":"23 August 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 June 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 June 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}