{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T22:14:08Z","timestamp":1769552048364,"version":"3.49.0"},"reference-count":77,"publisher":"Society for Industrial & Applied Mathematics (SIAM)","issue":"1","funder":[{"name":"National Science Foundation","award":["ECCS 2330196"],"award-info":[{"award-number":["ECCS 2330196"]}]},{"name":"Ministry of Science, Technological Development, and Innovation","award":["451-03-65\/2024-03\/200156"],"award-info":[{"award-number":["451-03-65\/2024-03\/200156"]}]},{"DOI":"10.13039\/100032113","name":"University of Novi Sad","doi-asserted-by":"crossref","award":["01-3394\/1"],"award-info":[{"award-number":["01-3394\/1"]}],"id":[{"id":"10.13039\/100032113","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Ministry of Science, Technological Development, and Innovation","award":["451-03-137\/2025-03\/200125"],"award-info":[{"award-number":["451-03-137\/2025-03\/200125"]}]},{"name":"Ministry of Science, Technological Development, and Innovation","award":["451-03-136\/2025-03\/200125"],"award-info":[{"award-number":["451-03-136\/2025-03\/200125"]}]},{"name":"LASCADO","award":["7359"],"award-info":[{"award-number":["7359"]}]},{"name":"Provincial Secretariat for Higher Education and Scientific Research","award":["142-451-2593\/2021-01\/2"],"award-info":[{"award-number":["142-451-2593\/2021-01\/2"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["SIAM J. Optim."],"published-print":{"date-parts":[[2026,3,31]]},"DOI":"10.1137\/24m1704154","type":"journal-article","created":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T08:35:41Z","timestamp":1769502941000},"page":"32-59","source":"Crossref","is-referenced-by-count":0,"title":["Large Deviation Upper Bounds and Improved MSE Rates of Nonlinear SGD: Heavy-Tailed Noise and Power of Symmetry"],"prefix":"10.1137","volume":"36","author":[{"given":"Aleksandar","family":"Armacki","sequence":"first","affiliation":[{"name":"School of Engineering, EPFL, 1015 Lausanne, Switzerland."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6054-673X","authenticated-orcid":true,"given":"Shuhua","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA 15213-3890 USA."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dragana","family":"Bajovi\u0107","sequence":"additional","affiliation":[{"name":"Faculty of Technical Sciences, Department of Power, Electronic, and Communication Engineering, University of Novi Sad, 21000 Novi Sad, Serbia."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Du\u0161an","family":"Jakoveti\u0107","sequence":"additional","affiliation":[{"name":"Faculty of Sciences, Department of Mathematics and Informatics, University of Novi Sad, 21000 Novi Sad, Serbia."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Soummya","family":"Kar","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA 15213-3890 USA."}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"351","published-online":{"date-parts":[[2026,1,27]]},"reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1134\/S0965542524700076"},{"key":"ref2","author":"Alistarh D.","year":"2017","journal-title":"in Advances in Neural Information Processing Systems 30 (NIPS 2017)"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-022-01822-7"},{"key":"ref4","unstructured":"A. Armacki, S. Yu, D. Bajovi\u0107, D. Jakoveti\u0107, and S. Kar, Large Deviations and Improved Mean-Squared Error Rates of Nonlinear SGD: Heavy-Tailed Noise and Power of Symmetry, preprint, arXiv:2410.15637, 2024."},{"key":"ref5","unstructured":"A. Armacki, S. Yu, P. Sharma, G. Joshi, D. Bajovi\u0107, D. Jakoveti\u0107, and S. Kar, High-probability convergence bounds for online nonlinear stochastic gradient descent under heavy-tailed noise, in Proceedings of The 28th International Conference on Artificial Intelligence and Statistics,\u00a0PMLR 258, 2025, pp. 1774\u20131782."},{"key":"ref6","unstructured":"W. Azizian, F. Iutzeler, J. Malick, and P. Mertikopoulos, What is the long-run distribution of stochastic gradient descent? A\u00a0large deviations analysis, in Proceedings of the 41st International Conference on Machine Learning,\u00a0PMLR 235,\u00a02024, pp. 2168\u20132229."},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2023.3324866"},{"key":"ref8","unstructured":"D. Bajovi\u0107, D. Jakoveti\u0107, and S. Kar, Large deviations rates for stochastic gradient descent with strongly convex functions, in Proceedings of the 26th International Conference on Artificial Intelligence and Statistics, Proceedings of Machine Learning Research 206, PMLR, 2023, pp. 10095\u201310111."},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2012.2210885"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2011.2157147"},{"key":"ref11","first-page":"29364","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Barsbey M.","year":"2021"},{"key":"ref12","unstructured":"B. Battash, L. Wolf, and O. Lindenbaum, Revisiting the noise model of stochastic gradient descent, in Proceedings of the 27th International Conference on Artificial Intelligence and Statistics, Proceedings of Machine Learning Research 238, PMLR, 2024, pp. 4780\u20134788."},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.2307\/121080"},{"key":"ref14","unstructured":"J. Bernstein, Y.X. Wang, K. Azizzadenesheli, and A. Anandkumar, signSGD: Compressed optimisation for non-convex problems, in International Conference on Machine Learning, PMLR, 2018, pp. 560\u2013569."},{"key":"ref15","unstructured":"J. Bernstein, J. Zhao, K. Azizzadenesheli, and A. Anandkumar, signSGD with majority vote is communication efficient and fault tolerant, in International Conference on Learning Representations, ICLR, 2019."},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1137\/S1052623497331063"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1137\/16M1080173"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854778"},{"key":"ref19","first-page":"13773","volume":"33","author":"Chen X.","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref20","first-page":"9955","volume":"35","author":"Crawshaw M.","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref21","unstructured":"A. Cutkosky and H. Mehta, Momentum improves normalized SGD, in International Conference on Machine Learning, PMLR, 2020, pp. 2260\u20132268."},{"key":"ref22","volume-title":"Large Deviations Techniques and Applications","volume":"38","author":"Dembo A.","year":"2009"},{"key":"ref23","volume-title":"Entropy, Large Deviations, and Statistical Mechanics","author":"Ellis R.","year":"2005"},{"key":"ref24","unstructured":"V. Gandikota, D. Kane, R. K. Maity, and A. Mazumdar, vqSGD: Vector quantized stochastic gradient descent, in International Conference on Artificial Intelligence and Statistics, PMLR, 2021, pp. 2197\u20132205."},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1137\/110848864"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1137\/120880811"},{"key":"ref27","unstructured":"M. Gurbuzbalaban, U. Simsekli, and L. Zhu, The heavy-tail phenomenon in SGD, in Proceedings of the 38th International Conference on Machine Learning, Proceedings of Machine Learning Research 139, PMLR, 2021, pp. 3964\u20133975."},{"key":"ref28","unstructured":"N. J. A. Harvey, C. Liaw, Y. Plan, and S. Randhawa, Tight analyses for non-smooth stochastic gradient descent, in Proceedings of the 32nd Conference on Learning Theory, Proceedings of Machine Learning Research 99, PMLR, 2019, pp. 1579\u20131613."},{"key":"ref29","author":"Hazan E.","year":"2015","journal-title":"in Advances in Neural Information Processing Systems 28 (NIPS 2015)"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.4310\/AMSA.2019.v4.n1.a1"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1137\/21M145896X"},{"key":"ref32","unstructured":"C. Jin, P. Netrapalli, R. Ge, S. M. Kakade, and M. I. Jordan, A Short Note on Concentration Inequalities for Random Vectors with SubGaussian Norm, preprint, arXiv:1902.03736, 2019."},{"key":"ref33","unstructured":"W. Jongeneel, D. Kuhn, and M. Li, A large deviations perspective on policy gradient algorithms, in Proceedings of the Sixth Annual Learning for Dynamics and Control Conference, Proceedings of Machine Learning Research 242, PMLR, 2024, pp. 916\u2013928."},{"key":"ref34","author":"Khaled A.","year":"2023","journal-title":"in Transactions on Machine Learning Research, TMLR"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-010-0434-y"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-39568-1"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2952219"},{"key":"ref38","unstructured":"X. Li and F. Orabona, On the convergence of stochastic gradient descent with adaptive stepsizes, in Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics, Proceedings of Machine Learning Research 89, PMLR, 2019, pp. 983\u2013992."},{"key":"ref39","unstructured":"X. Li and F. Orabona, A high probability analysis of adaptive SGD with momentum, in Workshop on Beyond First Order Methods in ML Systems at ICML\u201920, 2020."},{"key":"ref40","unstructured":"A. Lindhe, Topics on Large Deviations in Artificial Intelligence, Ph.D. thesis, KTH Royal Institute of Technology, 2023."},{"key":"ref41","unstructured":"Z. Liu, T. D. Nguyen, T. H. Nguyen, A. Ene, and H. Nguyen, High probability convergence of stochastic gradient methods, in International Conference on Machine Learning, PMLR, 2023, pp. 21884\u201321914."},{"key":"ref42","unstructured":"Z. Liu, J. Zhang, and Z. Zhou, Breaking the lower bound with (little) structure: Acceleration in non-convex stochastic optimization with heavy-tailed noise, in Proceedings of the 36th Conference on Learning Theory, Proceedings of Machine Learning Research 195, PMLR, 2023, pp. 2266\u20132290."},{"key":"ref43","unstructured":"Z. Liu and Z. Zhou, Stochastic Nonsmooth Convex Optimization with Heavy-Tailed Noises: High-Probability Bound, In-Expectation Rate and Initial Distance Adaptation, preprint, arXiv:2303.12277, 2023."},{"key":"ref44","first-page":"1","volume":"25","author":"Madden L.","year":"2024","journal-title":"J. Mach. Learn. Res."},{"key":"ref45","unstructured":"A. R. Masegosa and L. A. Ortega, A large deviation theory analysis on the implicit bias of SGD, preprint, to appear."},{"key":"ref46","first-page":"1117","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Mertikopoulos P.","year":"2020"},{"key":"ref47","volume-title":"Advances in Neural Information Processing Systems","volume":"24","author":"Moulines E.","year":"2011"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1017\/9781009053730"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1137\/070704277"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-91578-4"},{"key":"ref51","first-page":"24191","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Nguyen T. D.","year":"2023"},{"key":"ref52","unstructured":"R. Pascanu, T. Mikolov, and Y. Bengio, On the difficulty of training recurrent neural networks, in Proceedings of the 30th International Conference on Machine Learning, Proceedings of Machine Learning Research 28, PMLR, 2013, pp. 1310\u20131318."},{"key":"ref53","unstructured":"S. Peluchetti, S. Favaro, and S. Fortini, Stable behaviour of infinitely wide deep neural networks, in Proceedings of the 23rd International Conference on Artificial Intelligence and Statistics, Proceedings of Machine Learning Research 108, PMLR, 2020, pp. 1137\u20131146."},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1137\/0330046"},{"key":"ref55","first-page":"378","volume":"40","author":"Polyak B.","year":"1979","journal-title":"Autom. Remote Control"},{"key":"ref56","first-page":"45","volume":"34","author":"Polyak B.","year":"1973","journal-title":"Avtomat. i Telemekh."},{"key":"ref57","volume-title":"Introduction to Optimization","author":"Polyak B. T.","year":"1987"},{"key":"ref58","first-page":"95","volume":"45","author":"Polyak B. T.","year":"1984","journal-title":"Avtomat. i Telemekh."},{"key":"ref59","unstructured":"N. Puchkin, E. Gorbunov, N. Kutuzov, and A. Gasnikov, Breaking the heavy-tailed noise barrier in stochastic optimization problems, in Proceedings of The 27th International Conference on Artificial Intelligence and Statistics,\u00a0PMLR 238,\u00a02024, pp. 856\u2013864."},{"key":"ref60","unstructured":"A. Rakhlin, O. Shamir, and K. Sridharan, Making gradient descent optimal for strongly convex stochastic optimization, in Proceedings of the 29th International Conference on Machine Learning, PMLR, 2012, pp. 1571\u20131578."},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177729586"},{"key":"ref62","unstructured":"D. Ruppert, Efficient Estimations from a Slowly Convergent Robbins-Monro Process, Technical report 781, Cornell University Operations Research and Industrial Engineering, 1988."},{"key":"ref63","unstructured":"A. Sadiev, M. Danilova, E. Gorbunov, S. Horv\u00e1th, G. Gidel, P. Dvurechensky, A. Gasnikov, and P. Richt\u00e1rik, High-probability bounds for stochastic optimization and variational inequalities: The case of unbounded variance, in International Conference on Machine Learning, PMLR, 2023, pp. 29563\u201329648."},{"key":"ref64","unstructured":"O. Sebbouh, R. M. Gower, and A. Defazio, Almost sure convergence rates for stochastic gradient descent and stochastic heavy ball, in Proceedings of the 34th Conference on Learning Theory, Proceedings of Machine Learning Research 134, PMLR, 2021, pp. 3935\u20133971."},{"key":"ref65","first-page":"5","volume-title":"22nd Conference on Learning Theory (COLT 2009)","volume":"2","author":"Shalev-Shwartz S.","year":"2009"},{"key":"ref66","unstructured":"U. \u015eim\u015fekli, M. G\u00fcrb\u00fczbalaban, T. H. Nguyen, G. Richard, and L. Sagun, On the Heavy-Tailed Theory of Stochastic Gradient Descent for Deep Neural Networks, preprint, arXiv:1912.00018, 2019."},{"key":"ref67","unstructured":"U. Simsekli, L. Sagun, and M. Gurbuzbalaban, A tail-index analysis of stochastic gradient noise in deep neural networks, in International Conference on Machine Learning, PMLR, 2019, pp. 5827\u20135837."},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1016\/j.physrep.2009.05.002"},{"key":"ref69","unstructured":"C.P. Tsai, A. Prasad, S. Balakrishnan, and P. Ravikumar, Heavy-tailed streaming statistical estimation, in Proceedings of the 25th International Conference on Artificial Intelligence and Statistics, Proceedings of Machine Learning Research 151, PMLR, 2022, pp. 1251\u20131282."},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1214\/07-AOP348"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1017\/9781108231596"},{"key":"ref72","first-page":"18866","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Wang H.","year":"2021"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1017\/9781009004282"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2023.3277211"},{"key":"ref75","first-page":"15511","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Zhang B.","year":"2020"},{"key":"ref76","unstructured":"J. Zhang, T. He, S. Sra, and A. Jadbabaie, Why gradient clipping accelerates training: A theoretical justification for adaptivity, in International Conference on Learning Representations, ICLR, 2019."},{"key":"ref77","first-page":"15383","volume":"33","author":"Zhang J.","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."}],"container-title":["SIAM Journal on Optimization"],"original-title":[],"language":"en","deposited":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T08:35:57Z","timestamp":1769502957000},"score":1,"resource":{"primary":{"URL":"https:\/\/epubs.siam.org\/doi\/10.1137\/24M1704154"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,27]]},"references-count":77,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,3,31]]}},"alternative-id":["10.1137\/24M1704154"],"URL":"https:\/\/doi.org\/10.1137\/24m1704154","relation":{},"ISSN":["1052-6234","1095-7189"],"issn-type":[{"value":"1052-6234","type":"print"},{"value":"1095-7189","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,27]]}}}