{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T22:00:45Z","timestamp":1766268045385,"version":"build-2065373602"},"reference-count":27,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2021,2,8]],"date-time":"2021-02-08T00:00:00Z","timestamp":1612742400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,2,8]],"date-time":"2021-02-08T00:00:00Z","timestamp":1612742400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2021,3]]},"DOI":"10.1007\/s11432-020-3023-7","type":"journal-article","created":{"date-parts":[[2021,2,14]],"date-time":"2021-02-14T08:30:19Z","timestamp":1613291419000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["On the convergence and improvement of stochastic normalized gradient descent"],"prefix":"10.1007","volume":"64","author":[{"given":"Shen-Yi","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Yin-Peng","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Wu-Jun","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,2,8]]},"reference":[{"key":"3023_CR1","first-page":"142","volume":"17","author":"L Bottou","year":"1998","unstructured":"Bottou L. Online learning and stochastic approximations. On-line Learn Neural Netw, 1998, 17: 142","journal-title":"On-line Learn Neural Netw"},{"key":"3023_CR2","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H Robbins","year":"1951","unstructured":"Robbins H, Monro S. A stochastic approximation method. Ann Math Statist, 1951, 22: 400\u2013407","journal-title":"Ann Math Statist"},{"key":"3023_CR3","doi-asserted-by":"publisher","first-page":"012101","DOI":"10.1007\/s11432-018-9656-y","volume":"62","author":"C Y Chen","year":"2019","unstructured":"Chen C Y, Wang W L, Zhang Y Z, et al. A convergence analysis for a class of practical variance-reduction stochastic gradient MCMC. Sci China Inf Sci, 2019, 62: 012101","journal-title":"Sci China Inf Sci"},{"key":"3023_CR4","doi-asserted-by":"publisher","first-page":"506","DOI":"10.1137\/S1052623495294797","volume":"8","author":"P Tseng","year":"1998","unstructured":"Tseng P. An incremental gradient(-projection) method with momentum term and adaptive stepsize rule. SIAM J Optim, 1998, 8: 506\u2013531","journal-title":"SIAM J Optim"},{"key":"3023_CR5","first-page":"2121","volume":"12","author":"J Duchi","year":"2011","unstructured":"Duchi J, Hazan E, Singer Y. Adaptive subgradient methods for online learning and stochastic optimization. J Mach Learn Res, 2011, 12: 2121\u20132159","journal-title":"J Mach Learn Res"},{"key":"3023_CR6","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1007\/s10107-015-0871-8","volume":"156","author":"S Ghadimi","year":"2016","unstructured":"Ghadimi S, Lan G. Accelerated gradient methods for nonconvex nonlinear and stochastic programming. Math Program, 2016, 156: 59\u201399","journal-title":"Math Program"},{"key":"3023_CR7","doi-asserted-by":"publisher","first-page":"1269","DOI":"10.1007\/s11432-008-0117-y","volume":"51","author":"F Ding","year":"2008","unstructured":"Ding F, Yang H Z, Liu F. Performance analysis of stochastic gradient algorithms under weak conditions. Sci China Ser F-Inf Sci, 2008, 51: 1269\u20131280","journal-title":"Sci China Ser F-Inf Sci"},{"key":"3023_CR8","doi-asserted-by":"publisher","first-page":"1574","DOI":"10.1137\/070704277","volume":"19","author":"A Nemirovski","year":"2009","unstructured":"Nemirovski A, Juditsky A, Lan G, et al. Robust stochastic approximation approach to stochastic programming. SIAM J Optim, 2009, 19: 1574\u20131609","journal-title":"SIAM J Optim"},{"key":"3023_CR9","unstructured":"Krizhevsky A, Sutskever I, Hinton G E. Imagenet classification with deep convolutional neural networks. In: Proceedings of Advances in Neural Information Processing Systems, Lake Tahoe, 2012. 1097\u20131105"},{"key":"3023_CR10","doi-asserted-by":"crossref","unstructured":"LeCun Y A, Bottou L, Orr G B, et al. Neural Networks: Tricks of the Trade. Berlin: Springer Science & Business Media, 2012. 9\u201348","DOI":"10.1007\/978-3-642-35289-8_3"},{"key":"3023_CR11","unstructured":"Keskar N S, Mudigere D, Nocedal J, et al. On large-batch training for deep learning: generalization gap and sharp minima. In: Proceedings of International Conference on Learning Representations, Toulon, 2017"},{"key":"3023_CR12","unstructured":"Sutskever I, Martens J, Dahl G E, et al. On the importance of initialization and momentum in deep learning. In: Proceedings of International Conference on Machine Learning, Atlanta, 2013. 1139\u20131147"},{"key":"3023_CR13","unstructured":"Hazan E, Levy K, Shalev-Shwartz S. Beyond convexity: stochastic quasi-convex optimization. In: Proceedings of Advances in Neural Information Processing Systems, Montr\u00e9al, 2015. 1594\u20131602"},{"key":"3023_CR14","unstructured":"Levy K Y. The power of normalization: faster evasion of saddle points. 2016. ArXiv:1611.04831"},{"key":"3023_CR15","unstructured":"Fang C, Li C J, Lin Z, et al. Spider: near-optimal non-convex optimization via stochastic path-integrated differential estimator. In: Proceedings of Advances in Neural Information Processing Systems, Montr\u00e9al, 2018. 689\u2013699"},{"key":"3023_CR16","unstructured":"Zhang J, He T, Sra S, et al. Why gradient clipping accelerates training: a theoretical justification for adaptivity. In: Proceedings of International Conference on Learning Representations, Addis Ababa, 2020"},{"key":"3023_CR17","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4419-8853-9","volume-title":"Introductory Lectures on Convex Optimization: A Basic Course","author":"Y E Nesterov","year":"2004","unstructured":"Nesterov Y E. Introductory Lectures on Convex Optimization: A Basic Course. Berlin: Springer Science & Business Media, 2004"},{"key":"3023_CR18","unstructured":"Wilson A C, Mackey L, Wibisono A. Accelerating rescaled gradient descent: fast optimization of smooth functions. In: Proceedings of Advances in Neural Information Processing Systems, Vancouver, 2019. 13533\u201313543"},{"key":"3023_CR19","unstructured":"Ge R, Huang F, Jin C, et al. Escaping from saddle points-online stochastic gradient for tensor decomposition. In: Proceedings of Conference on Learning Theory, Paris, 2015. 797\u2013842"},{"key":"3023_CR20","unstructured":"Zinkevich M. Online convex programming and generalized infinitesimal gradient ascent. In: Proceedings of International Conference on Machine learning, Washington, 2003. 928\u2013936"},{"key":"3023_CR21","unstructured":"Johnson R, Zhang T. Accelerating stochastic gradient descent using predictive variance reduction. In: Proceedings of Advances in Neural Information Processing Systems, Lake Tahoe, 2013. 315\u2013323"},{"key":"3023_CR22","unstructured":"Lei L, Ju C, Chen J, et al. Non-convex finite-sum optimization via SCSG methods. In: Proceedings of Advances in Neural Information Processing Systems, Long Beach, 2017. 2348\u20132358"},{"key":"3023_CR23","unstructured":"Allen Z, Bengio S, Wallach H, et al. Natasha2-faster non-convex optimization than SGD. In: Proceedings of Advances in Neural Information Processing Systems, Montr\u00e9al, 2018. 2680\u20132691"},{"key":"3023_CR24","unstructured":"Defazio A, Bottou L. On the ineffectiveness of variance reduced optimization for deep learning. In: Proceedings of Advances in Neural Information Processing Systems, Vancouver, 2019. 1753\u20131763"},{"key":"3023_CR25","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, et al. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, 2016. 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"3023_CR26","unstructured":"Krizhevsky A, Hinton G E. Learning Multiple Layers of Features From Tiny Images. Technical Report TR-2009. 2009"},{"key":"3023_CR27","unstructured":"Yuan Z, Yan Y, Jin R, et al. Stagewise training accelerates convergence of testing error over SGD. In: Proceedings of Advances in Neural Information Processing Systems, Vancouver, 2019. 2604\u20132614"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-020-3023-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-020-3023-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-020-3023-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,19]],"date-time":"2022-04-19T20:30:17Z","timestamp":1650400217000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-020-3023-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,8]]},"references-count":27,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2021,3]]}},"alternative-id":["3023"],"URL":"https:\/\/doi.org\/10.1007\/s11432-020-3023-7","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"type":"print","value":"1674-733X"},{"type":"electronic","value":"1869-1919"}],"subject":[],"published":{"date-parts":[[2021,2,8]]},"assertion":[{"value":"20 March 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 May 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 June 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 February 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"132103"}}