{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T06:32:28Z","timestamp":1770013948298,"version":"3.49.0"},"reference-count":61,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"NSF DMS","award":["2110722"],"award-info":[{"award-number":["2110722"]}]},{"name":"NSF DMS","award":["2309549"],"award-info":[{"award-number":["2309549"]}]},{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"publisher","award":["11831002"],"award-info":[{"award-number":["11831002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Signal Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/tsp.2024.3398496","type":"journal-article","created":{"date-parts":[[2024,5,9]],"date-time":"2024-05-09T17:44:01Z","timestamp":1715276641000},"page":"2527-2542","source":"Crossref","is-referenced-by-count":2,"title":["Convergence Analysis of an Adaptively Regularized Natural Gradient Method"],"prefix":"10.1109","volume":"72","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4485-7563","authenticated-orcid":false,"given":"Jiayuan","family":"Wu","sequence":"first","affiliation":[{"name":"College of Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9356-2990","authenticated-orcid":false,"given":"Jiang","family":"Hu","sequence":"additional","affiliation":[{"name":"Massachusetts General Hospital and Harvard Medical School, Harvard University, Boston, MA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2518-6975","authenticated-orcid":false,"given":"Hongchao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Mathematics, Louisiana State University, Baton Rouge, LA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1762-0671","authenticated-orcid":false,"given":"Zaiwen","family":"Wen","sequence":"additional","affiliation":[{"name":"Beijing International Center for Mathematical Research, Center for Machine Learning Research and College of Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"issue":"1","key":"ref1","first-page":"5776","article-title":"New insights and perspectives on the natural gradient method","volume":"21","author":"Martens","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref2","first-page":"2408","article-title":"Optimizing neural networks with Kronecker-factored approximate curvature","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Martens","year":"2015"},{"key":"ref3","first-page":"573","article-title":"A Kronecker-factored approximate Fisher matrix for convolution layers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Grosse","year":"2016"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.4208\/jcm.2104-m2021-0007"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2014.2357775"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2017.2784360"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevResearch.2.033429"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21468\/SciPostPhysCodeb.7"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177729586"},{"key":"ref10","first-page":"315","article-title":"Accelerating stochastic gradient descent using predictive variance reduction","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"26","author":"Johnson","year":"2013"},{"key":"ref11","first-page":"1646","article-title":"Saga: A fast incremental gradient method with support for non-strongly convex composite objectives","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"27","author":"Defazio","year":"2014"},{"issue":"7","key":"ref12","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi","year":"2011","journal-title":"J. Mach. Learn. Res."},{"key":"ref13","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref15","first-page":"127","article-title":"Neural learning in structured parameter spaces-natural Riemannian gradient","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"9","author":"Amari","year":"1996"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.2307\/1403504"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s10915-022-01911-x"},{"key":"ref18","article-title":"Scalable second order optimization for deep learning","author":"Anil","year":"2020"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3213654"},{"key":"ref20","article-title":"A mini-block natural gradient method for deep neural networks","author":"Bahamou","year":"2022"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1137\/22m1477805"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1137\/140954362"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1137\/15M1021106"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-018-1346-5"},{"key":"ref25","first-page":"2386","article-title":"Practical quasi-Newton methods for training deep neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Goldfarb","year":"2020"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01051"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1137\/0328072"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1137\/0330046"},{"key":"ref29","first-page":"71","article-title":"Stochastic gradient descent for non-smooth optimization: Convergence results and optimal averaging schemes","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Shamir","year":"2013"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/1015330.1015332"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1137\/16M1080173"},{"key":"ref32","article-title":"On the convergence of Adam and beyond","author":"Reddi","year":"2019"},{"key":"ref33","first-page":"5200","article-title":"SGD: General analysis and improved rates","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Gower","year":"2019"},{"key":"ref34","article-title":"A unified convergence theorem for stochastic optimization methods","author":"Li","year":"2022"},{"issue":"4","key":"ref35","first-page":"643","article-title":"Gradient methods for minimizing functionals","volume":"3","author":"Polyak","year":"1963","journal-title":"Zhurnal Vychislitel\u2019noi Matematiki i Matematicheskoi Fiziki"},{"key":"ref36","article-title":"On exponential convergence of SGD in non-convex over-parametrized learning","author":"Bassily","year":"2018"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1137\/21m1468048"},{"key":"ref38","first-page":"8082","article-title":"Fast convergence of natural gradient descent for over-parameterized neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Zhang","year":"2019"},{"key":"ref39","volume-title":"Numerical Optimization","author":"Nocedal","year":"2006"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-017-1141-8"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1162\/08997660260028683"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1016\/S0893-6080(00)00051-4"},{"key":"ref43","first-page":"4156","article-title":"Limitations of the empirical fisher approximation for natural gradient descent","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Kunstner","year":"2019"},{"key":"ref44","article-title":"Kronecker-factored curvature approximations for recurrent neural networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Martens","year":"2018"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1137\/19M1291832"},{"key":"ref46","article-title":"Complexity bounds of iterative linear quadratic optimization algorithms for discrete time nonlinear control","author":"Roulet","year":"2022"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/s10915-017-0488-6"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/s10589-022-00393-9"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1017\/9781108591034"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1287\/ijoo.2019.0016"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5946"},{"key":"ref52","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","author":"Keskar","year":"2016"},{"key":"ref53","article-title":"Don\u2019t decay the learning rate, increase the batch size","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Smith","year":"2018"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-021-01709-z"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/s10957-020-01666-1"},{"key":"ref56","article-title":"Gaussian error linear units (Gelus)","author":"Hendrycks","year":"2016"},{"key":"ref57","volume-title":"Matrix Perturbation Theory","author":"Stewart","year":"1990"},{"issue":"3","key":"ref58","first-page":"341","article-title":"A modified finite Newton method for fast solution of large scale linear SVMs","volume":"6","author":"Keerthi","year":"2005","journal-title":"J. Mach. Learn. Res."},{"key":"ref59","first-page":"361","article-title":"Rcv1: A new benchmark collection for text categorization research","volume":"5","author":"Lewis","year":"2004","journal-title":"J. Mach. Learn. Res."},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1038\/ncomms5308"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959134"}],"container-title":["IEEE Transactions on Signal Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/78\/10347386\/10527404.pdf?arnumber=10527404","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T04:38:56Z","timestamp":1717735136000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10527404\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":61,"URL":"https:\/\/doi.org\/10.1109\/tsp.2024.3398496","relation":{},"ISSN":["1053-587X","1941-0476"],"issn-type":[{"value":"1053-587X","type":"print"},{"value":"1941-0476","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}