{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,23]],"date-time":"2026-02-23T21:45:11Z","timestamp":1771883111214,"version":"3.50.1"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2021,9,24]],"date-time":"2021-09-24T00:00:00Z","timestamp":1632441600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,9,24]],"date-time":"2021-09-24T00:00:00Z","timestamp":1632441600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2022,1]]},"DOI":"10.1007\/s10994-021-06056-w","type":"journal-article","created":{"date-parts":[[2021,9,24]],"date-time":"2021-09-24T20:23:48Z","timestamp":1632515028000},"page":"345-375","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["Understanding generalization error of SGD in nonconvex optimization"],"prefix":"10.1007","volume":"111","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3982-9145","authenticated-orcid":false,"given":"Yi","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Yingbin","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Huishuai","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,24]]},"reference":[{"issue":"1","key":"6056_CR1","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1007\/s10107-011-0484-9","volume":"137","author":"H Attouch","year":"2013","unstructured":"Attouch, H., Bolte, J., & Svaiter, B. (2013). Convergence of descent methods for semi-algebraic and tame problems: proximal algorithms, forward-backward splitting, and regularized Gauss-Seidel methods. Mathematical Programming, 137(1), 91\u2013129.","journal-title":"Mathematical Programming"},{"key":"6056_CR2","unstructured":"Bartlett, P., Foster, D.J., & Telgarsky, M. (2017). Spectrally-normalized margin bounds for neural networks. In: Proceedings of the Advances in Neural Information Processing Systems (NIPS), pp. 6240\u20136249."},{"key":"6056_CR3","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4419-9467-7","volume-title":"Convex Analysis and Monotone Operator Theory in Hilbert Spaces","author":"H Bauschke","year":"2011","unstructured":"Bauschke, H., & Combettes, P. (2011). Convex Analysis and Monotone Operator Theory in Hilbert Spaces. New York: Springer."},{"issue":"1","key":"6056_CR4","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1137\/080716542","volume":"2","author":"A Beck","year":"2009","unstructured":"Beck, A., & Teboulle, M. (2009). A fast iterative shrinkage-thresholding algorithm for linear inverse problems. SIAM Journal on Imaging Sciences, 2(1), 183\u2013202.","journal-title":"SIAM Journal on Imaging Sciences"},{"key":"6056_CR5","doi-asserted-by":"crossref","unstructured":"Bottou, L. (2010). Large-scale machine learning with stochastic gradient descent. In Proceedings of the 19th International Conference on Computational Statistics, pp. 177\u2013186.","DOI":"10.1007\/978-3-7908-2604-3_16"},{"key":"6056_CR6","first-page":"499","volume":"2","author":"O Bousquet","year":"2002","unstructured":"Bousquet, O., & Elisseeff, A. (2002). Stability and generalization. Journal of Machine Learning Research, 2, 499\u2013526.","journal-title":"Journal of Machine Learning Research"},{"key":"6056_CR7","doi-asserted-by":"crossref","unstructured":"Chang, C., & Lin, C. (2011). LIBSVM: A library for support vector machines. ACM Transactions on Intelligent Systems and Technology, 2:1\u201327. http:\/\/www.csie.ntu.edu.tw\/~cjlin\/libsvm.","DOI":"10.1145\/1961189.1961199"},{"key":"6056_CR8","unstructured":"Charles, Z., & Papailiopoulos, D. (2017). Stability and generalization of learning algorithms that converge to global optima. ArXiv: 1710.08402."},{"key":"6056_CR9","first-page":"55","volume":"6","author":"A Elisseeff","year":"2005","unstructured":"Elisseeff, A., Evgeniou, T., & Pontil, M. (2005). Stability of randomized learning algorithms. Journal of Machine Learning Research, 6, 55\u201379.","journal-title":"Journal of Machine Learning Research"},{"issue":"1","key":"6056_CR10","doi-asserted-by":"publisher","first-page":"267","DOI":"10.1007\/s10107-014-0846-1","volume":"155","author":"S Ghadimi","year":"2016","unstructured":"Ghadimi, S., Lan, G., & Zhang, H. (2016). Mini-batch stochastic approximation methods for nonconvex stochastic composite optimization. Mathematical Programming, 155(1), 267\u2013305.","journal-title":"Mathematical Programming"},{"key":"6056_CR11","unstructured":"Hardt, M., Recht, B., & Singer, Y. (2016). Train faster, generalize better: Stability of stochastic gradient descent. In Proceedings of the 33rd International Conference on Machine Learning (ICML), pp. 1225\u20131234."},{"key":"6056_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"6056_CR13","doi-asserted-by":"crossref","unstructured":"Karimi, H., Nutini, J., & Schmidt, M. (2016). Linear convergence of gradient and proximal-gradient methods under the Polyak\u2013\u0141ojasiewicz condition. Machine Learning and Knowledge Discovery in Databases: European Conference, pp. 795\u2013811.","DOI":"10.1007\/978-3-319-46128-1_50"},{"key":"6056_CR14","unstructured":"Krizhevsky, A. (2009). Learning multiple layers of features from tiny images. Technical report."},{"key":"6056_CR15","unstructured":"Kuzborskij, I., & Lampert, C.\u00a0H. (2017). Data-dependent stability of stochastic gradient descent. ArXiv: 1703.01678v3."},{"issue":"11","key":"6056_CR16","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y Lecun","year":"1998","unstructured":"Lecun, Y., Bottou, L., Bengio, Y., & Haffner, P. (1998). Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278\u20132324.","journal-title":"Proceedings of the IEEE"},{"key":"6056_CR17","unstructured":"Li, X., Ling, S., Strohmer, T., & Wei, K. (2016). Rapid, robust, and reliable blind deconvolution via nonconvex optimization. arXiv: 1606.04933v1."},{"key":"6056_CR18","unstructured":"Li, Q., Zhou, Y., Liang, Y., & Varshney, P.\u00a0K. (2017). Convergence analysis of proximal gradient with momentum for nonconvex optimization. In Proceedings of the 34th International Conference on Machine Learning (ICML)."},{"key":"6056_CR19","first-page":"1","volume":"18","author":"J Lin","year":"2017","unstructured":"Lin, J., & Rosasco, L. (2017). Optimal rates for multi-pass stochastic gradient methods. Journal of Machine Learning Research, 18, 1\u201347.","journal-title":"Journal of Machine Learning Research"},{"key":"6056_CR20","unstructured":"\u0141ojasiewicz, S. (1963). A topological property of real analytic subsets. Colloid du CNRS, Les equations aux derivees partielles, pp. 87\u201389."},{"key":"6056_CR21","unstructured":"London, B. (2017). A PAC-Bayesian analysis of randomized learning with application to stochastic gradient descent. In Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS)."},{"key":"6056_CR22","doi-asserted-by":"crossref","unstructured":"McAllester, D.\u00a0A. (1999). PAC-Bayesian model averaging. In Proceedings of the 12th Annual Conference on Computational Learning Theory, pp. 164\u2013170.","DOI":"10.1145\/307400.307435"},{"key":"6056_CR23","unstructured":"Mou, W., Wang, L., Zhai, X., & Zheng, K. (2017). Generalization bounds of SGLD for non-convex learning: Two theoretical viewpoints. ArXiv: 1707.05947."},{"issue":"4","key":"6056_CR24","doi-asserted-by":"publisher","first-page":"1574","DOI":"10.1137\/070704277","volume":"19","author":"A Nemirovski","year":"2009","unstructured":"Nemirovski, A., Juditsky, A., Lan, G., & Shapiro, A. (2009). Robust stochastic approximation approach to stochastic programming. SIAM Journal on Optimization, 19(4), 1574\u20131609.","journal-title":"SIAM Journal on Optimization"},{"key":"6056_CR25","unstructured":"Neyshabur, B., Bhojanapalli, S., McAllester, D., & Srebro, N. (2018). A PAC-Bayesian approach to spectrally-normalized margin bounds for neural networks. In Proceedings of the International Conference on Learning Representations(ICLR)."},{"issue":"3","key":"6056_CR26","doi-asserted-by":"publisher","first-page":"127","DOI":"10.1561\/2400000003","volume":"1","author":"N Parikh","year":"2014","unstructured":"Parikh, N., & Boyd, S. (2014). Proximal algorithms. Foundations and Trends in Optimization, 1(3), 127\u2013239.","journal-title":"Foundations and Trends in Optimization"},{"key":"6056_CR27","doi-asserted-by":"crossref","unstructured":"Pensia, A., Jog, V., & Loh, P. (2018). Generalization error bounds for noisy, iterative algorithms. arXiv: 1801.04295v1.","DOI":"10.1109\/ISIT.2018.8437571"},{"key":"6056_CR28","unstructured":"Poggio, T., Voinea, S., & L., R. (2011). Online learning, stability, and stochastic gradient descent. ArXiv: 1105.4701v3."},{"issue":"4","key":"6056_CR29","doi-asserted-by":"publisher","first-page":"864","DOI":"10.1016\/0041-5553(63)90382-3","volume":"3","author":"B Polyak","year":"1963","unstructured":"Polyak, B. (1963). Gradient methods for the minimisation of functionals. USSR Computational Mathematics and Mathematical Physics, 3(4), 864\u2013878.","journal-title":"USSR Computational Mathematics and Mathematical Physics"},{"key":"6056_CR30","unstructured":"Russo, D., & Zou, J. (2016). Controlling bias in adaptive data analysis using information theory. In Proceedings of the 19th International Conference on Artificial Intelligence and Statistics (AISTATS), vol.\u00a051, pp. 1232\u20131240."},{"key":"6056_CR31","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781107298019","volume-title":"Understanding Machine Learning: From Theory to Algorithms","author":"S Shalev-Shwartz","year":"2014","unstructured":"Shalev-Shwartz, S., & Ben-David, S. (2014). Understanding Machine Learning: From Theory to Algorithms. New York: Cambridge University Press."},{"key":"6056_CR32","first-page":"2635","volume":"11","author":"S Shalev-Shwartz","year":"2010","unstructured":"Shalev-Shwartz, S., Shamir, O., Srebro, N., & Sridharan, K. (2010). Learnability, stability and uniform convergence. Journal of Machine Learning Research, 11, 2635\u20132670.","journal-title":"Journal of Machine Learning Research"},{"key":"6056_CR33","doi-asserted-by":"publisher","DOI":"10.1007\/0-387-26771-9_4","volume-title":"On Complexity of Stochastic Programming Problems","author":"A Shapiro","year":"2005","unstructured":"Shapiro, A., & Nemirovski, A. (2005). On Complexity of Stochastic Programming Problems. New York: Springer."},{"issue":"16","key":"6056_CR34","doi-asserted-by":"publisher","first-page":"4265","DOI":"10.1109\/TSP.2017.2708039","volume":"65","author":"J Sokoli\u0107","year":"2017","unstructured":"Sokoli\u0107, J., Giryes, R., Sapiro, G., & Rodrigues, M. R. D. (2017). Robust large margin deep neural networks. IEEE Transactions on Signal Processing, 65(16), 4265\u20134280.","journal-title":"IEEE Transactions on Signal Processing"},{"issue":"11","key":"6056_CR35","doi-asserted-by":"publisher","first-page":"1134","DOI":"10.1145\/1968.1972","volume":"27","author":"LG Valiant","year":"1984","unstructured":"Valiant, L. G. (1984). A theory of the learnable. Communications of the ACM, 27(11), 1134\u20131142.","journal-title":"Communications of the ACM"},{"key":"6056_CR36","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4757-2440-0","volume-title":"The Nature of Statistical Learning Theory","author":"VN Vapnik","year":"1995","unstructured":"Vapnik, V. N. (1995). The Nature of Statistical Learning Theory. New York: Springer."},{"key":"6056_CR37","volume-title":"Statistical Learning Theory","author":"VN Vapnik","year":"1998","unstructured":"Vapnik, V. N. (1998). Statistical Learning Theory. Hoboken: Wiley."},{"key":"6056_CR38","unstructured":"Xu, A., & Raginsky, M. (2017). Information-theoretic analysis of generalization capability of learning algorithms. In Proceedings of the 30th Advances in Neural Information Processing Systems (NIPS), pp. 2521\u20132530."},{"issue":"3","key":"6056_CR39","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1007\/s10994-011-5268-1","volume":"86","author":"H Xu","year":"2012","unstructured":"Xu, H., & Mannor, S. (2012). Robustness and generalization. Machine Learning, 86(3), 391\u2013423.","journal-title":"Machine Learning"},{"key":"6056_CR40","unstructured":"Yin, D., Pananjady, A., Lam, M., Papailiopoulos, D., Ramchandran, K., & Bartlett, P.\u00a0L. (2017). Gradient diversity: a key ingredient for scalable distributed learning. ArXiv: 1706.05699v3."},{"key":"6056_CR41","unstructured":"Zahavy, T., Kang, B., Sivak, A., Feng, J., Xu, H., & Mannor, S. (2017). Ensemble robustness and generalization of stochastic deep learning algorithms. ArXiv: 1602.02389v4."},{"key":"6056_CR42","unstructured":"Zhang, C., Bengio, S., Hardt, M., Recht, B., & Vinyals, O. (2017). Understanding deep learning requires rethinking generalization. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"6056_CR43","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Zhang, H., & Liang, Y. (2016). Geometrical properties of phase retrieval and convergence of accelerated reshaped wirtinger flow. In Proceedings of the 54th Annual Allerton Conference on Communication, Control, and Computing (Allerton).","DOI":"10.1109\/ALLERTON.2016.7852249"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-021-06056-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-021-06056-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-021-06056-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,24]],"date-time":"2022-09-24T00:05:27Z","timestamp":1663977927000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-021-06056-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9,24]]},"references-count":43,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2022,1]]}},"alternative-id":["6056"],"URL":"https:\/\/doi.org\/10.1007\/s10994-021-06056-w","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,9,24]]},"assertion":[{"value":"20 April 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 July 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 August 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 September 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The author declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"All the codes are made publicly available on GitHub at .","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}