{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T07:44:41Z","timestamp":1770018281744,"version":"3.49.0"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T00:00:00Z","timestamp":1659312000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T00:00:00Z","timestamp":1659312000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["11831002"],"award-info":[{"award-number":["11831002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Sci Comput"],"published-print":{"date-parts":[[2022,9]]},"DOI":"10.1007\/s10915-022-01911-x","type":"journal-article","created":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T09:08:27Z","timestamp":1659344907000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Sketch-Based Empirical Natural Gradient Methods for Deep Learning"],"prefix":"10.1007","volume":"92","author":[{"given":"Minghan","family":"Yang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1762-0671","authenticated-orcid":false,"given":"Zaiwen","family":"Wen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mengyun","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pengxiang","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,8,1]]},"reference":[{"key":"1911_CR1","unstructured":"Agarwal, N., Anil, R., Hazan, E., Koren, T., Zhang, C.:, Disentangling adaptive gradient methods from learning rates, arXiv preprint arXiv:2002.11803 (2020)"},{"key":"1911_CR2","unstructured":"Amari, S.-i.: Neural learning in structured parameter spaces-natural Riemannian gradient. In: Advances in Neural Information Processing Systems pp 127\u2013133 (1997)"},{"key":"1911_CR3","unstructured":"Anil, R., Gupta, V., Koren, T., Regan, K., Singer, Y.:, Scalable second order optimization for deep learning, arXiv preprint arXiv:2002.09018 (2020)"},{"key":"1911_CR4","unstructured":"Bernacchia, A., Lengyel, M., Hennequin, G.:, Exact natural gradient in deep linear networks and its application to the nonlinear case. In: Advances in Neural Information Processing Systems (2018)"},{"key":"1911_CR5","unstructured":"Botev, A., Ritter, H., Barber, D.:, Practical Gauss-Newton optimisation for deep learning, in International Conference on Machine Learning, pp 557\u2013565 (2017)"},{"key":"1911_CR6","doi-asserted-by":"publisher","first-page":"1008","DOI":"10.1137\/140954362","volume":"26","author":"RH Byrd","year":"2016","unstructured":"Byrd, R.H., Hansen, S.L., Nocedal, J., Singer, Y.: A stochastic quasi-Newton method for large-scale optimization. SIAM J. Optim. 26, 1008\u20131031 (2016)","journal-title":"SIAM J. Optim."},{"key":"1911_CR7","unstructured":"Cai, T., Gao, R., Hou, J., Chen, S., Wang, D., He, D., Zhang, Z., Wang, L.:, A Gram-Gauss-Newton method learning overparameterized deep neural networks for regression problems, ArXiv: abs\/1905.11675 (2019)"},{"key":"1911_CR8","unstructured":"Goldfarb, D., Ren, Y., Bahamou, A.:, Practical quasi-Newton methods for training deep neural networks. In: Advances in Neural Information Processing Systems, Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M. F., Lin, H., (eds.) vol. 33, Curran Associates, Inc., pp 2386\u20132396 (2020)"},{"key":"1911_CR9","unstructured":"Gower, R., Kovalev, D., Lieder, F., Richt\u00e1rik, P. RSN: Randomized subspace Newton, Advances in Neural Information Processing Systems 32 (2019)"},{"key":"1911_CR10","unstructured":"Goyal, P., Doll\u00e1r, P., Girshick, R., Noordhuis, P., Wesolowski, L., Kyrola, A., Tulloch, A., Jia, Y., He, K.:, Accurate, large minibatch SGD: Training ImageNet in 1 hour. ArXiv:1706.02677 (2017)"},{"key":"1911_CR11","unstructured":"Grosse, R., Martens, J.:, A kronecker-factored approximate fisher matrix for convolution layers. In: International Conference on Machine Learning pp. 573\u2013582 (2016)"},{"key":"1911_CR12","unstructured":"Gupta, V., Koren, T., Singer, Y.:, Shampoo: Preconditioned stochastic tensor optimization. In: International Conference on Machine Learning pp 1842\u20131850 (2018)"},{"key":"1911_CR13","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1007\/s10994-007-5016-8","volume":"69","author":"E Hazan","year":"2007","unstructured":"Hazan, E., Agarwal, A., Kale, S.: Logarithmic regret algorithms for online convex optimization. Mach. Learn. 69, 169\u2013192 (2007)","journal-title":"Mach. Learn."},{"key":"1911_CR14","doi-asserted-by":"crossref","unstructured":"Karakida, R., Osawa, K.:, Understanding approximate Fisher information for fast convergence of natural gradient descent in wide neural networks. In: Advances in Neural Information Processing Systems (2020)","DOI":"10.1088\/1742-5468\/ac3ae3"},{"key":"1911_CR15","unstructured":"Keskar, N. S., Mudigere, D., Nocedal, J., Smelyanskiy, M., Tang, P. T. P.:, On large-batch training for deep learning: Generalization gap and sharp minima. In: International Conference on Learning Representations (2017)"},{"key":"1911_CR16","unstructured":"Kingma, D. P., Ba, J.:, Adam: A method for stochastic optimization, arXiv preprint arXiv:1412.6980 (2014)"},{"key":"1911_CR17","first-page":"19377","volume":"33","author":"J Lacotte","year":"2020","unstructured":"Lacotte, J., Pilanci, M.: Effective dimension adaptive sketching methods for faster regularized least-squares optimization. Adv. Neural. Inf. Process. Syst. 33, 19377\u201319387 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1911_CR18","unstructured":"Lacotte, J., Wang, Y., Pilanci, M.: Adaptive Newton sketch: Linear-time optimization with quadratic convergence and effective hessian dimensionality. In: International Conference on Machine Learning, PMLR pp. 5926\u20135936 (2021)"},{"key":"1911_CR19","first-page":"123","volume":"3","author":"MW Mahoney","year":"2011","unstructured":"Mahoney, M.W., et al.: Randomized algorithms for matrices and data, Foundations and Trends\u00ae. Mach. Learn. 3, 123\u2013224 (2011)","journal-title":"Mach. Learn."},{"key":"1911_CR20","first-page":"1","volume":"21","author":"J Martens","year":"2020","unstructured":"Martens, J.: New insights and perspectives on the natural gradient method. J. Mach. Learn. Res. 21, 1\u201376 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"1911_CR21","unstructured":"Martens, J., Grosse, R.: Optimizing neural networks with kronecker-factored approximate curvature. In: International conference on machine learning pp. 2408\u20132417 (2015)"},{"key":"1911_CR22","unstructured":"Mattson, P., Cheng, C., Coleman, C., Diamos, G., Micikevicius, P., Patterson, D., Tang, H., Wei, G.-Y., Bailis, P., Bittorf, V. et al.:, Mlperf training benchmark, arXiv preprint arXiv:1910.01500 (2019)"},{"key":"1911_CR23","unstructured":"Osawa, K., Tsuji, Y., Ueno, Y., Naruse, A., Foo, C.-S., Yokota, R.: Scalable and practical natural gradient for large-scale deep learning, IEEE Transactions on Pattern Analysis and Machine Intelligence (2020)"},{"key":"1911_CR24","first-page":"1842","volume":"17","author":"M Pilanci","year":"2016","unstructured":"Pilanci, M., Wainwright, M.J.: Iterative hessian sketch: Fast and accurate solution approximation for constrained least-squares, The. J. Mach. Learn. Res. 17, 1842\u20131879 (2016)","journal-title":"J. Mach. Learn. Res."},{"key":"1911_CR25","doi-asserted-by":"publisher","first-page":"205","DOI":"10.1137\/15M1021106","volume":"27","author":"M Pilanci","year":"2017","unstructured":"Pilanci, M., Wainwright, M.J.: Newton Sketch: A near linear-time optimization algorithm with linear-quadratic convergence. SIAM J. Optim. 27, 205\u2013245 (2017)","journal-title":"SIAM J. Optim."},{"key":"1911_CR26","unstructured":"Ren, Y., Goldfarb, D.: Efficient subsampled Gauss-Newton and natural gradient methods for training neural networks, arXiv preprint arXiv:1906.02353 (2019)"},{"key":"1911_CR27","unstructured":"Ren, Y., Goldfarb, D.: Kronecker-factored quasi-Newton methods for convolutional neural networks, arXiv preprint arXiv:2102.06737 (2021)"},{"key":"1911_CR28","unstructured":"Ren, Y., Goldfarb, D.: Tensor normal training for deep learning models, Advances in Neural Information Processing Systems 34 (2021)"},{"key":"1911_CR29","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H Robbins","year":"1951","unstructured":"Robbins, H., Monro, S.: A stochastic approximation method. Ann. Math. Stat. 22, 400\u2013407 (1951)","journal-title":"Ann. Math. Stat."},{"key":"1911_CR30","unstructured":"Roux, N. L., Manzagol, P.-A., Bengio, Y.: Topmoumoute online natural gradient algorithm, in Advances in Neural Information Processing Systems pp. 849\u2013856 (2008)"},{"key":"1911_CR31","first-page":"1","volume":"20","author":"CJ Shallue","year":"2019","unstructured":"Shallue, C.J., Lee, J., Antognini, J., Sohl-Dickstein, J., Frostig, R., Dahl, G.E.: Measuring the effects of data parallelism on neural network training. J. Mach. Learn. Res. 20, 1\u201349 (2019)","journal-title":"J. Mach. Learn. Res."},{"key":"1911_CR32","unstructured":"Sun, R.:, Optimization for deep learning: theory and algorithms, arXiv preprint arXiv:1912.08957 (2019)"},{"key":"1911_CR33","first-page":"8039","volume":"18","author":"S Wang","year":"2017","unstructured":"Wang, S., Gittens, A., Mahoney, M.W.: Sketched ridge regression: Optimization perspective, statistical perspective, and model averaging. J. Mach. Learn. Res. 18, 8039\u20138088 (2017)","journal-title":"J. Mach. Learn. Res."},{"key":"1911_CR34","first-page":"1697","volume":"17","author":"S Wang","year":"2016","unstructured":"Wang, S., Luo, L., Zhang, Z.: Spsd matrix approximation vis column selection: Theories, algorithms, and extensions. J. Mach. Learn. Res. 17, 1697\u20131745 (2016)","journal-title":"J. Mach. Learn. Res."},{"key":"1911_CR35","doi-asserted-by":"publisher","first-page":"927","DOI":"10.1137\/15M1053141","volume":"27","author":"X Wang","year":"2017","unstructured":"Wang, X., Ma, S., Goldfarb, D., Liu, W.: Stochastic Quasi-Newton Methods for Nonconvex Stochastic Optimization. SIAM J. Optim. 27, 927\u2013956 (2017)","journal-title":"SIAM J. Optim."},{"key":"1911_CR36","doi-asserted-by":"crossref","unstructured":"Yang, M., Milzarek, A., Wen, Z., Zhang, T.: A stochastic extra-step quasi-Newton method for nonsmooth nonconvex optimization, Mathematical Programming pp. 1\u201347 (2021)","DOI":"10.1007\/s10107-021-01629-y"},{"key":"1911_CR37","doi-asserted-by":"crossref","unstructured":"Yang, M., Xu, D., Chen, H., Wen, Z., Chen, M.: Enhance curvature information by structured stochastic quasi-Newton methods. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition pp. 10654\u201310663 (2021)","DOI":"10.1109\/CVPR46437.2021.01051"},{"key":"1911_CR38","doi-asserted-by":"crossref","unstructured":"Yang, M., Xu, D., Cui, Q., Wen, Z., Xu, P.:, NG+: A multi-step matrix-product natural gradient method for deep learning, arXiv preprint arXiv:2106.07454 (2021)","DOI":"10.1007\/s10915-022-01911-x"},{"key":"1911_CR39","unstructured":"Zhang, G., Martens, J., Grosse, R. B.:, Fast convergence of natural gradient descent for over-parameterized neural networks. In: Advances in Neural Information Processing Systems pp. 8082\u20138093 (2019)"}],"container-title":["Journal of Scientific Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10915-022-01911-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10915-022-01911-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10915-022-01911-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,26]],"date-time":"2022-08-26T22:15:32Z","timestamp":1661552132000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10915-022-01911-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,1]]},"references-count":39,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2022,9]]}},"alternative-id":["1911"],"URL":"https:\/\/doi.org\/10.1007\/s10915-022-01911-x","relation":{},"ISSN":["0885-7474","1573-7691"],"issn-type":[{"value":"0885-7474","type":"print"},{"value":"1573-7691","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,8,1]]},"assertion":[{"value":"7 December 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 June 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 August 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have not disclosed any competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"94"}}