{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T02:14:46Z","timestamp":1769220886381,"version":"3.49.0"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,11,15]],"date-time":"2023-11-15T00:00:00Z","timestamp":1700006400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,15]],"date-time":"2023-11-15T00:00:00Z","timestamp":1700006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Nonlinear Sci"],"published-print":{"date-parts":[[2024,2]]},"DOI":"10.1007\/s00332-023-09992-0","type":"journal-article","created":{"date-parts":[[2023,11,15]],"date-time":"2023-11-15T06:01:41Z","timestamp":1700028101000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Stochastic Gradient Descent with Noise of Machine Learning Type Part II: Continuous Time Analysis"],"prefix":"10.1007","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3766-5332","authenticated-orcid":false,"given":"Stephan","family":"Wojtowytsch","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,15]]},"reference":[{"key":"9992_CR1","doi-asserted-by":"crossref","unstructured":"Agrachev, A., Barilari, D., Boscain. U.: A Comprehensive Introduction to Sub-Riemannian Geometry, volume 181. Cambridge University Press (2019)","DOI":"10.1017\/9781108677325"},{"issue":"Suppl 2","key":"9992_CR2","doi-asserted-by":"publisher","first-page":"1555","DOI":"10.1007\/s00245-021-09804-5","volume":"84","author":"\u015e-L Ani\u0163a","year":"2021","unstructured":"Ani\u0163a, \u015e-L.: Optimal control of stochastic differential equations via Fokker\u2013Planck equations. Appl. Math. Optim. 84(Suppl 2), 1555\u20131583 (2021)","journal-title":"Appl. Math. Optim."},{"issue":"15","key":"9992_CR3","first-page":"4639","volume":"2017","author":"D Barilari","year":"2017","unstructured":"Barilari, D., Boscain, U., Charlot, G., Neel, R.W.: On the heat diffusion for generic Riemannian and sub-Riemannian structures. Int. Math. Res. Not. 2017(15), 4639\u20134672 (2017)","journal-title":"Int. Math. Res. Not."},{"issue":"38","key":"9992_CR4","doi-asserted-by":"publisher","first-page":"16459","DOI":"10.1073\/pnas.1003972107","volume":"107","author":"M Bonforte","year":"2010","unstructured":"Bonforte, M., Dolbeault, J., Grillo, G., V\u00e1zquez, J.-L.: Sharp rates of decay of solutions to the nonlinear fast diffusion equation via functional inequalities. Proc. Natl. Acad. Sci. 107(38), 16459\u201316464 (2010)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"9992_CR5","volume-title":"Functional Analysis","author":"H Brezis","year":"2011","unstructured":"Brezis, H.: Functional Analysis. Sobolev spaces and partial differential equations. Universitext. Springer, New York (2011)"},{"key":"9992_CR6","first-page":"3036","volume":"34","author":"L Chizat","year":"2018","unstructured":"Chizat, L., Bach, F.: On the global convergence of gradient descent for over-parameterized models using optimal transport. Adv. Neural Inform. Process. Syst. 34, 3036\u20133046 (2018)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"9992_CR7","unstructured":"Chizat, L., Bach, F.: Implicit bias of gradient descent for wide two-layer neural networks trained with the logistic loss. arxiv:2002.04486, (2020)"},{"key":"9992_CR8","doi-asserted-by":"crossref","unstructured":"De\u00a0Lellis, C.: Lecture notes on rectifiable sets, densities, and tangent measures. Zurich Lect. Adv. Math. 7 (2008)","DOI":"10.4171\/044"},{"key":"9992_CR9","unstructured":"Damian, A., Ma, T., Lee, J.D.: Label noise SGD provably prefers flat global minimizers. In: Beygelzimer,A., Dauphin, Y., Liang,P., and Vaughan, J.\u00a0W. (eds), Advances in Neural Information Processing Systems (2021)"},{"issue":"16","key":"9992_CR10","doi-asserted-by":"publisher","first-page":"5985","DOI":"10.1016\/j.na.2012.05.008","volume":"75","author":"J Dolbeault","year":"2012","unstructured":"Dolbeault, J., Volzone, B.: Improved Poincar\u00e9 inequalities. Nonlinear Anal. Theory Methods Appl. 75(16), 5985\u20136001 (2012)","journal-title":"Nonlinear Anal. Theory Methods Appl."},{"key":"9992_CR11","doi-asserted-by":"crossref","unstructured":"Evans, L.C., Gariepy, R.F.: Measure Theory and Fine Properties of Functions. CRC press, (2015)","DOI":"10.1201\/b18333"},{"issue":"1","key":"9992_CR12","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1112\/jlms\/s2-7.1.95","volume":"2","author":"D Edmunds","year":"1973","unstructured":"Edmunds, D., Peletier, L.: A Liouville theorem for degenerate elliptic equations. J. Lond. Math. Soc. 2(1), 95\u2013100 (1973)","journal-title":"J. Lond. Math. Soc."},{"key":"9992_CR13","unstructured":"Evans, L.C.: Partial Differential Equations, volume\u00a019 of Graduate Studies in Mathematics. American Mathematical Society, Providence, RI, second edition, (2010)"},{"key":"9992_CR14","volume-title":"Partial Differential Equations of Parabolic Type","author":"A Friedman","year":"2008","unstructured":"Friedman, A.: Partial Differential Equations of Parabolic Type. Courier Dover Publications, USA (2008)"},{"issue":"10","key":"9992_CR15","doi-asserted-by":"publisher","first-page":"4165","DOI":"10.1016\/j.jfa.2017.02.015","volume":"272","author":"X Fern\u00e1ndez-Real","year":"2017","unstructured":"Fern\u00e1ndez-Real, X., Ros-Oton, X.: Regularity theory for general stable operators: Parabolic equations. J. Funct. Anal. 272(10), 4165\u20134221 (2017)","journal-title":"J. Funct. Anal."},{"key":"9992_CR16","unstructured":"Ge, R., Huang, F., Jin, C., Yuan, Y.: Escaping from saddle points\u2014online stochastic gradient for tensor decomposition. In: Conference on Learning Theory, pp. 797\u2013842. PMLR, (2015)"},{"key":"9992_CR17","unstructured":"Gupta, K., Siegel, J.W., Wojtowytsch, S.: Achieving acceleration despite very noisy gradients. arXiv:2302.05515v2, (2023)"},{"key":"9992_CR18","unstructured":"Gilbarg, D., Trudinger, N.S.: Elliptic partial differential equations of second order, volume 224. springer, (2015)"},{"key":"9992_CR19","unstructured":"Hoffer, E., Hubara, I., Soudry, D.: Train longer, generalize better: Closing the generalization gap in large batch training of neural networks. arXiv preprint arXiv:1705.08741, (2017)"},{"key":"9992_CR20","unstructured":"Hassannezhad, A., Kokarev, G.: Sub-Laplacian eigenvalue bounds on sub-Riemannian manifolds. arXiv preprintarXiv:1407.0358, (2014)"},{"key":"9992_CR21","unstructured":"Hu, K., Ren, Z., Siska, D., Szpruch, L.: Mean-field Langevin dynamics and energy landscape of neural networks. arXiv:1905.07769 , (2019)"},{"key":"9992_CR22","unstructured":"Jastrz\u0229bski, S., Kenton, Z., Arpit, D., Ballas, N., Fischer, A., Bengio, Y., Storkey, A.: Three factors influencing minima in SGD. arXiv preprintarXiv:1711.04623, (2017)"},{"issue":"1","key":"9992_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1137\/S0036141096303359","volume":"29","author":"R Jordan","year":"1998","unstructured":"Jordan, R., Kinderlehrer, D., Otto, F.: The variational formulation of the Fokker-Planck equation. SIAM J. Math. Anal. 29(1), 1\u201317 (1998)","journal-title":"SIAM J. Math. Anal."},{"key":"9992_CR24","doi-asserted-by":"crossref","unstructured":"Javanmard, A., Mondelli, M., Montanari, A.: Analysis of a two-layer neural network via displacement convexity. arXiv preprintarXiv:1901.01375, (2019)","DOI":"10.1214\/20-AOS1945"},{"key":"9992_CR25","unstructured":"Jabir, J.-F., \u0160i\u0161ka, D., Szpruch, \u0141.: Mean-field neural odes via relaxed optimal control. arXiv preprint arXiv:1912.05475, (2019)"},{"key":"9992_CR26","volume-title":"Wahrscheinlichkeitstheorie","author":"A Klenke","year":"2006","unstructured":"Klenke, A.: Wahrscheinlichkeitstheorie, vol. 1. Springer, Cham (2006)"},{"key":"9992_CR27","unstructured":"Latz, J.: Analysis of stochastic gradient descent in continuous time. arXiv preprintarXiv:2004.07177, (2020)"},{"key":"9992_CR28","unstructured":"Li, Q., Tai, C.: W.\u00a0E. Dynamics of stochastic gradient algorithms. arXiv:1511.06251, (2015)"},{"key":"9992_CR29","unstructured":"Li, Q., Tai, C., Weinan, E.: Stochastic modified equations and adaptive stochastic gradient algorithms. In: International Conference on Machine Learning, pp. 2101\u20132110. PMLR, (2017)"},{"key":"9992_CR30","unstructured":"Luo, V., Wang, Y.: How many factors influence minima in SGD? arXiv preprintarXiv:2009.11858, (2020)"},{"key":"9992_CR31","unstructured":"Li, Z., Wang, T., Arora, S.: What happens after SGD reaches zero loss? \u2013a mathematical framework. In: International Conference on Learning Representations, (2022)"},{"key":"9992_CR32","unstructured":"Liu, K., Ziyin, L., Ueda, M.: Stochastic gradient descent with large learning rate. arXiv preprintarXiv:2012.03636, (2020)"},{"key":"9992_CR33","doi-asserted-by":"crossref","unstructured":"Masmoudi, N.: About the Hardy inequality. In: An Invitation to Mathematics, pp. 165\u2013180. Springer, (2011)","DOI":"10.1007\/978-3-642-19533-4_11"},{"key":"9992_CR34","unstructured":"Mandt, S., Hoffman, M.D., Blei, D.M.: Continuous-time limit of stochastic gradient descent revisited. NIPS-2015, (2015)"},{"key":"9992_CR35","unstructured":"Mandt, S., Hoffman, M., Blei, D.: A variational analysis of stochastic gradient algorithms. In: International Conference on Machine Learning, pp. 354\u2013363. PMLR, (2016)"},{"key":"9992_CR36","unstructured":"Mei, S., Misiakiewicz, T., Montanari, A.: Mean-field theory of two-layers neural networks: Dimension-free bounds and kernel limit. arXiv preprintarXiv:1902.06015, (2019)"},{"issue":"33","key":"9992_CR37","doi-asserted-by":"publisher","first-page":"E7665","DOI":"10.1073\/pnas.1806579115","volume":"115","author":"S Mei","year":"2018","unstructured":"Mei, S., Montanari, A., Nguyen, P.-M.: A mean field view of the landscape of two-layer neural networks. Proc. Natl. Acad. Sci. 115(33), E7665\u2013E7671 (2018)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"9992_CR38","unstructured":"Neelakantan, A., Vilnis, L., Le, Q.V., Sutskever, I., Kaiser, L., Kurach, K., Martens, J.: Adding gradient noise improves learning for very deep networks. arXiv preprintarXiv:1511.06807, (2015)"},{"issue":"1","key":"9992_CR39","doi-asserted-by":"publisher","first-page":"143","DOI":"10.7146\/math.scand.a-10602","volume":"8","author":"A Persson","year":"1960","unstructured":"Persson, A.: Bounds for the discrete part of the spectrum of a semi-bounded Schr\u00f6dinger operator. Math. Scand. 8(1), 143\u2013153 (1960)","journal-title":"Math. Scand."},{"key":"9992_CR40","doi-asserted-by":"crossref","unstructured":"Robbins, H., Monro, S.: A stochastic approximation method. Annal. Math Stat. pp. 400\u2013407, (1951)","DOI":"10.1214\/aoms\/1177729586"},{"issue":"12","key":"9992_CR41","doi-asserted-by":"publisher","first-page":"8675","DOI":"10.1016\/j.jde.2016.02.033","volume":"260","author":"X Ros-Oton","year":"2016","unstructured":"Ros-Oton, X., Serra, J.: Regularity theory for general stable operators. J. Diff. Equ. 260(12), 8675\u20138715 (2016)","journal-title":"J. Diff. Equ."},{"key":"9992_CR42","unstructured":"Raginsky, M., Rakhlin, A., Telgarsky, M.: Non-convex learning via stochastic gradient Langevin dynamics: a nonasymptotic analysis. In: Conference on Learning Theory, pp. 1674\u20131703. PMLR, (2017)"},{"key":"9992_CR43","unstructured":"Rotskoff, G.M., Vanden-Eijnden, E.: Neural networks as interacting particle systems: Asymptotic convexity of the loss landscape and universal scaling of the approximation error. arXiv:1805.00915, (2018)"},{"key":"9992_CR44","unstructured":"Smith, S.L., Le, Q.\u00a0V.: A Bayesian perspective on generalization and stochastic gradient descent. arXiv preprintarXiv:1710.06451, (2017)"},{"issue":"1","key":"9992_CR45","doi-asserted-by":"publisher","first-page":"933","DOI":"10.1137\/17M1126825","volume":"8","author":"J Sirignano","year":"2017","unstructured":"Sirignano, J., Spiliopoulos, K.: Stochastic gradient descent in continuous time. SIAM J. Financ. Mat. 8(1), 933\u2013961 (2017)","journal-title":"SIAM J. Financ. Mat."},{"issue":"2","key":"9992_CR46","doi-asserted-by":"publisher","first-page":"725","DOI":"10.1137\/18M1192184","volume":"80","author":"J Sirignano","year":"2020","unstructured":"Sirignano, J., Spiliopoulos, K.: Mean field analysis of neural networks: A law of large numbers. SIAM J. Appl. Math. 80(2), 725\u2013752 (2020)","journal-title":"SIAM J. Appl. Math."},{"issue":"2","key":"9992_CR47","doi-asserted-by":"publisher","first-page":"124","DOI":"10.1287\/stsy.2019.0050","volume":"10","author":"J Sirignano","year":"2020","unstructured":"Sirignano, J., Spiliopoulos, K.: Stochastic gradient descent in continuous time: A central limit theorem. Stochastic Syst. 10(2), 124\u2013151 (2020)","journal-title":"Stochastic Syst."},{"key":"9992_CR48","unstructured":"Simsekli, U., Sagun, L., Gurbuzbalaban, M.: A tail-index analysis of stochastic gradient noise in deep neural networks. In: International Conference on Machine Learning, pp. 5827\u20135837. PMLR, (2019)"},{"issue":"1","key":"9992_CR49","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1103\/PhysRevE.55.99","volume":"55","author":"BJ West","year":"1997","unstructured":"West, B.J., Grigolini, P., Metzler, R., Nonnenmacher, T.F.: Fractional diffusion and l\u00e9vy stable processes. Phys. Rev. E 55(1), 99 (1997)","journal-title":"Phys. Rev. E"},{"key":"9992_CR50","unstructured":"Wojtowytsch, S.: On the global convergence of gradient descent training for two-layer Relu networks in the mean field regime. arXiv:2005.13530, (2020)"},{"key":"9992_CR51","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1007\/s00332-023-09903-3","volume":"33","author":"S Wojtowytsch","year":"2023","unstructured":"Wojtowytsch, S.: Stochastic gradient descent with noise of machine learning type. Part I: Discrete time analysis. J. Nonlinear Sci. 33, 45 (2023)","journal-title":"J. Nonlinear Sci."},{"key":"9992_CR52","unstructured":"Welling, M., Teh, Y.W.: Bayesian learning via stochastic gradient Langevin dynamics. In: Proceedings of the 28th International Conference on Machine Learning (ICML-11), pp. 681\u2013688. Citeseer, (2011)"},{"key":"9992_CR53","unstructured":"Zhou, P., Feng, J., Ma, C., Xiong, C., HOI, S.: et\u00a0al. Towards theoretically understanding why SGD generalizes better than Adam in deep learning. arXiv preprintarXiv:2010.05627, (2020)"},{"key":"9992_CR54","unstructured":"Ziyin, L., Liu, K., Mori, T., Ueda, M.: Strength of minibatch noise in SGD. arXiv preprint arXiv:2102.05375, (2021)"},{"key":"9992_CR55","unstructured":"Zhu, Z., Wu, J., Yu, B., Wu, L., Ma, J.: The anisotropic noise in stochastic gradient descent: Its behavior of escaping from sharp minima and regularization effects. arXiv preprintarXiv:1803.00195, (2018)"}],"container-title":["Journal of Nonlinear Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00332-023-09992-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00332-023-09992-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00332-023-09992-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,15]],"date-time":"2024-02-15T17:08:17Z","timestamp":1708016897000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00332-023-09992-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,15]]},"references-count":55,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,2]]}},"alternative-id":["9992"],"URL":"https:\/\/doi.org\/10.1007\/s00332-023-09992-0","relation":{},"ISSN":["0938-8974","1432-1467"],"issn-type":[{"value":"0938-8974","type":"print"},{"value":"1432-1467","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,11,15]]},"assertion":[{"value":"11 October 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 October 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 November 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"There is no conflict of interest to report.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"16"}}