{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T20:26:11Z","timestamp":1766262371459,"version":"3.48.0"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T00:00:00Z","timestamp":1729641600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T00:00:00Z","timestamp":1729641600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2024,11]]},"DOI":"10.1007\/s11432-022-3892-8","type":"journal-article","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T03:03:05Z","timestamp":1729825385000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Stochastic normalized gradient descent with momentum for large-batch training"],"prefix":"10.1007","volume":"67","author":[{"given":"Shen-Yi","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Chang-Wei","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Yin-Peng","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Wu-Jun","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,23]]},"reference":[{"key":"3892_CR1","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H Robbins","year":"1951","unstructured":"Robbins H, Monro S. A stochastic approximation method. 
Ann Math Statist, 1951, 22: 400\u2013407","journal-title":"Ann Math Statist"},{"key":"3892_CR2","doi-asserted-by":"publisher","first-page":"1269","DOI":"10.1007\/s11432-008-0117-y","volume":"51","author":"F Ding","year":"2008","unstructured":"Ding F, Yang H Z, Liu F. Performance analysis of stochastic gradient algorithms under weak conditions. Sci China Ser F-Inf Sci, 2008, 51: 1269\u20131280","journal-title":"Sci China Ser F-Inf Sci"},{"key":"3892_CR3","doi-asserted-by":"publisher","first-page":"012101","DOI":"10.1007\/s11432-018-9656-y","volume":"62","author":"C Y Chen","year":"2019","unstructured":"Chen C Y, Wang W L, Zhang Y Z, et al. A convergence analysis for a class of practical variance-reduction stochastic gradient MCMC. Sci China Inf Sci, 2019, 62: 012101","journal-title":"Sci China Inf Sci"},{"key":"3892_CR4","doi-asserted-by":"publisher","first-page":"132103","DOI":"10.1007\/s11432-020-3023-7","volume":"64","author":"S-Y Zhao","year":"2021","unstructured":"Zhao S-Y, Xie Y-P, Li W-J. On the convergence and improvement of stochastic normalized gradient descent. Sci China Inf Sci, 2021, 64: 132103","journal-title":"Sci China Inf Sci"},{"key":"3892_CR5","doi-asserted-by":"publisher","first-page":"661","DOI":"10.1145\/2623330.2623612","volume-title":"Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining","author":"M Li","year":"2014","unstructured":"Li M, Zhang T, Chen Y Q, et al. Efficient mini-batch training for stochastic optimization. In: Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, New York, 2014. 661\u2013670"},{"key":"3892_CR6","first-page":"7184","volume-title":"Proceedings of the International Conference on Machine Learning","author":"H Yu","year":"2019","unstructured":"Yu H, Jin R, Yang S. On the linear speedup analysis of communication efficient momentum SGD for distributed non-convex optimization. 
In: Proceedings of the International Conference on Machine Learning, Long Beach, 2019. 7184\u20137193"},{"key":"3892_CR7","first-page":"19","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"M Li","year":"2014","unstructured":"Li M, Andersen D G, Smola A J, et al. Communication efficient distributed machine learning with the parameter server. In: Proceedings of the Advances in Neural Information Processing Systems, Montr\u00e9al, 2014. 19\u201327"},{"key":"3892_CR8","first-page":"5330","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"X R Lian","year":"2017","unstructured":"Lian X R, Zhang C, Zhang H, et al. Can decentralized algorithms outperform centralized algorithms? A case study for decentralized parallel stochastic gradient descent. In: Proceedings of the Advances in Neural Information Processing Systems, Long Beach, 2017. 5330\u20135340"},{"key":"3892_CR9","first-page":"6654","volume-title":"Proceedings of the International Conference on Machine Learning","author":"T Lin","year":"2021","unstructured":"Lin T, Karimireddy S P, Stich S U, et al. Quasi-global momentum: accelerating decentralized deep learning on heterogeneous data. In: Proceedings of the International Conference on Machine Learning, 2021. 6654\u20136665"},{"key":"3892_CR10","first-page":"5693","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"H Yu","year":"2019","unstructured":"Yu H, Yang S, Zhu S H. Parallel restarted SGD with faster convergence and less communication: demystifying why model averaging works for deep learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, Hawaii, 2019. 5693\u20135700"},{"key":"3892_CR11","volume-title":"Proceedings of the International Conference on Learning Representations","author":"T Lin","year":"2020","unstructured":"Lin T, Stich S U, Patel K K, et al. Don\u2019t use large mini-batches, use local SGD. 
In: Proceedings of the International Conference on Learning Representations, Addis Ababa, 2020"},{"key":"3892_CR12","first-page":"7510","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"H C Gao","year":"2021","unstructured":"Gao H C, Xu A, Huang H. On the convergence of communication-efficient local SGD for federated learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2021. 7510\u20137518"},{"key":"3892_CR13","unstructured":"You Y, Gitman I, Ginsburg B. Scaling SGD batch size to 32K for ImageNet training. 2017. ArXiv:1708.03888"},{"key":"3892_CR14","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1007\/978-3-642-35289-8_3","volume-title":"Neural Networks: Tricks of the Trade","author":"Y A LeCun","year":"2012","unstructured":"LeCun Y A, Bottou L, Orr G B, et al. Neural Networks: Tricks of the Trade. Berlin: Springer Science & Business Media, 2012. 9\u201348"},{"key":"3892_CR15","volume-title":"Proceedings of the International Conference on Learning Representations","author":"N S Keskar","year":"2017","unstructured":"Keskar N S, Mudigere D, Nocedal J, et al. On large-batch training for deep learning: generalization gap and sharp minima. In: Proceedings of the International Conference on Learning Representations, Toulon, 2017"},{"key":"3892_CR16","first-page":"1731","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"E Hoffer","year":"2017","unstructured":"Hoffer E, Hubara I, Soudry D. Train longer, generalize better: closing the generalization gap in large batch training of neural networks. In: Proceedings of the Advances in Neural Information Processing Systems, Long Beach, 2017. 1731\u20131741"},{"key":"3892_CR17","first-page":"6094","volume-title":"Proceedings of the International Conference on Machine Learning","author":"T Lin","year":"2020","unstructured":"Lin T, Kong L J, Stich S U, et al. Extrapolation for large-batch training in deep learning. 
In: Proceedings of the International Conference on Machine Learning, 2020. 6094\u20136104"},{"key":"3892_CR18","unstructured":"Goyal P, Doll\u00e1r P, Girshick R, et al. Accurate, large minibatch SGD: training ImageNet in 1 hour. 2017. ArXiv:1706.02677"},{"key":"3892_CR19","first-page":"1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Y You","year":"2019","unstructured":"You Y, Hseu J, Ying C, et al. Large-batch training for LSTM and beyond. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, Denver, 2019. 1\u201316"},{"key":"3892_CR20","unstructured":"Ginsburg B, Castonguay P, Hrinchuk O, et al. Stochastic gradient methods with layer-wise adaptive moments for training of deep networks. 2019. ArXiv:1905.11286"},{"key":"3892_CR21","first-page":"7883","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"Z Y Huo","year":"2021","unstructured":"Huo Z Y, Gu B, Huang H. Large batch optimization for deep learning using new complete layer-wise adaptive rate scaling. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2021. 7883\u20137890"},{"key":"3892_CR22","first-page":"12360","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Liu","year":"2022","unstructured":"Liu Y, Mai S Q, Chen X N, et al. Towards efficient and scalable sharpness-aware minimization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Louisiana, 2022. 12360\u201312370"},{"key":"3892_CR23","first-page":"13925","volume-title":"Proceedings of the International Conference on Machine Learning","author":"R Liu","year":"2022","unstructured":"Liu R, Mozafari B. Communication-efficient distributed learning for large batch optimization. 
In: Proceedings of the International Conference on Machine Learning, Baltimore, 2022. 13925\u201313946"},{"key":"3892_CR24","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Y You","year":"2020","unstructured":"You Y, Li J, Reddi S, et al. Large batch optimization for deep learning: training BERT in 76 minutes. In: Proceedings of the International Conference on Learning Representations, Addis Ababa, 2020"},{"key":"3892_CR25","first-page":"3009","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"K Yuan","year":"2021","unstructured":"Yuan K, Chen Y M, Huang X M, et al. DecentLaM: decentralized momentum SGD for large-batch deep training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, Montr\u00e9al, 2021. 3009\u20133019"},{"key":"3892_CR26","volume-title":"Proceedings of the International Conference on Learning Representations","author":"J Z Zhang","year":"2020","unstructured":"Zhang J Z, He T X, Sra S, et al. Why gradient clipping accelerates training: a theoretical justification for adaptivity. In: Proceedings of the International Conference on Learning Representations, Addis Ababa, 2020"},{"key":"3892_CR27","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4419-8853-9","volume-title":"Introductory Lectures on Convex Optimization: A Basic Course","author":"Y E Nesterov","year":"2004","unstructured":"Nesterov Y E. Introductory Lectures on Convex Optimization: A Basic Course. Berlin: Springer Science & Business Media, 2004"},{"key":"3892_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/0041-5553(64)90137-5","volume":"4","author":"B T Polyak","year":"1964","unstructured":"Polyak B T. Some methods of speeding up the convergence of iteration methods. 
USSR Comput Math Math Phys, 1964, 4: 1\u201317","journal-title":"USSR Comput Math Math Phys"},{"key":"3892_CR29","first-page":"1594","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"E Hazan","year":"2015","unstructured":"Hazan E, Levy K, Shalev-Shwartz S. Beyond convexity: stochastic quasi-convex optimization. In: Proceedings of the Advances in Neural Information Processing Systems, Montr\u00e9al, 2015. 1594\u20131602"},{"key":"3892_CR30","first-page":"2955","volume-title":"Proceedings of the International Joint Conference on Artificial Intelligence","author":"Y Yan","year":"2018","unstructured":"Yan Y, Yang T B, Li Z, et al. A unified analysis of stochastic momentum methods for deep learning. In: Proceedings of the International Joint Conference on Artificial Intelligence, Stockholm, 2018. 2955\u20132961"},{"key":"3892_CR31","first-page":"770","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"K M He","year":"2016","unstructured":"He K M, Zhang X Y, Ren S Q, et al. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, 2016. 770\u2013778"},{"key":"3892_CR32","first-page":"1","volume-title":"Proceedings of the Conference on Machine Translation","author":"M Ott","year":"2018","unstructured":"Ott M, Edunov S, Grangier D, et al. Scaling neural machine translation. In: Proceedings of the Conference on Machine Translation, Brussels, 2018. 1\u20139"},{"key":"3892_CR33","volume-title":"Proceedings of the International Conference on Learning Representations","author":"I Loshchilov","year":"2017","unstructured":"Loshchilov I, Hutter F. SGDR: stochastic gradient descent with warm restarts. 
In: Proceedings of the International Conference on Learning Representations, Toulon, 2017"},{"key":"3892_CR34","first-page":"248","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"J Deng","year":"2009","unstructured":"Deng J, Dong W, Socher R, et al. ImageNet: a large-scale hierarchical image database. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Miami, 2009. 248\u2013255"},{"key":"3892_CR35","volume-title":"Proceedings of the International Conference on Learning Representations","author":"A Dosovitskiy","year":"2021","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et al. An image is worth 16 \u00d7 16 words: transformers for image recognition at scale. In: Proceedings of the International Conference on Learning Representations, 2021"},{"key":"3892_CR36","volume-title":"Proceedings of the International Conference on Learning Representations","author":"S Merity","year":"2018","unstructured":"Merity S, Keskar N S, Socher R. Regularizing and optimizing LSTM language models. In: Proceedings of the International Conference on Learning Representations, Vancouver, 2018"},{"key":"3892_CR37","volume-title":"Proceedings of the International Conference on Learning Representations","author":"S Merity","year":"2017","unstructured":"Merity S, Xiong C M, Bradbury J, et al. Pointer sentinel mixture models. In: Proceedings of the International Conference on Learning Representations, Toulon, 2017"},{"key":"3892_CR38","first-page":"1725","volume-title":"Proceedings of the International Joint Conference on Artificial Intelligence","author":"H F Guo","year":"2017","unstructured":"Guo H F, Tang R M, Ye Y M, et al. DeepFM: a factorization-machine based neural network for CTR prediction. In: Proceedings of the International Joint Conference on Artificial Intelligence, Melbourne, 2017. 
1725\u20131731"},{"key":"3892_CR39","volume-title":"Proceedings of the International Conference on Learning Representations","author":"D P Kingma","year":"2015","unstructured":"Kingma D P, Ba J. ADAM: a method for stochastic optimization. In: Proceedings of the International Conference on Learning Representations, San Diego, 2015"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-022-3892-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-022-3892-8","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-022-3892-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T20:24:09Z","timestamp":1766262249000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-022-3892-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,23]]},"references-count":39,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2024,11]]}},"alternative-id":["3892"],"URL":"https:\/\/doi.org\/10.1007\/s11432-022-3892-8","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"type":"print","value":"1674-733X"},{"type":"electronic","value":"1869-1919"}],"subject":[],"published":{"date-parts":[[2024,10,23]]},"assertion":[{"value":"27 June 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 November 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 April 
2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 October 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"212101"}}