{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,15]],"date-time":"2026-01-15T09:00:48Z","timestamp":1768467648257,"version":"3.49.0"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T00:00:00Z","timestamp":1666396800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T00:00:00Z","timestamp":1666396800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s10489-022-04207-7","type":"journal-article","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T02:03:10Z","timestamp":1666404190000},"page":"14233-14248","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Smooth momentum: improving lipschitzness in gradient descent"],"prefix":"10.1007","volume":"53","author":[{"given":"Bum Jun","family":"Kim","sequence":"first","affiliation":[]},{"given":"Hyeyeon","family":"Choi","sequence":"additional","affiliation":[]},{"given":"Hyeonah","family":"Jang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6023-1837","authenticated-orcid":false,"given":"Sang Woo","family":"Kim","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,22]]},"reference":[{"issue":"9","key":"4207_CR1","doi-asserted-by":"publisher","first-page":"6400","DOI":"10.1007\/s10489-021-02293-7","volume":"51","author":"SK Pal","year":"2021","unstructured":"Pal SK, Pramanik A, Maiti J, Mitra P (2021) Deep learning in multi-object detection and tracking: state of the art. Appl Intell 51(9):6400\u20136429","journal-title":"Appl Intell"},{"issue":"10","key":"4207_CR2","doi-asserted-by":"publisher","first-page":"3125","DOI":"10.1007\/s10489-020-01704-5","volume":"50","author":"Q Mao","year":"2020","unstructured":"Mao Q, Sun H, Zuo L, Jia R (2020) Finding every car: a traffic surveillance multi-scale vehicle object detection method. Appl Intell 50(10):3125\u20133136","journal-title":"Appl Intell"},{"issue":"11","key":"4207_CR3","doi-asserted-by":"publisher","first-page":"3623","DOI":"10.1007\/s10489-020-01742-z","volume":"50","author":"L Lu","year":"2020","unstructured":"Lu L, Wu D, Wu T, Huang F, Yi Y (2020) Anchor-free multi-orientation text detection in natural scene images. Appl Intell 50(11):3623\u20133637","journal-title":"Appl Intell"},{"issue":"2","key":"4207_CR4","doi-asserted-by":"publisher","first-page":"393","DOI":"10.1007\/s10994-020-05929-w","volume":"110","author":"H Gouk","year":"2021","unstructured":"Gouk H, Frank E, Pfahringer B, Cree MJ (2021) Regularisation of neural networks by enforcing Lipschitz continuity. Mach Learn 110(2):393\u2013416","journal-title":"Mach Learn"},{"issue":"1","key":"4207_CR5","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton GE, Krizhevsky A, Sutskever I, Salakhutdinov R (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15(1):1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"4207_CR6","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: accelerating deep network training by reducing internal covariate shift, vol 37 of JMLR workshop and conference proceedings pp 448\u2013456 (JMLR.org)"},{"key":"4207_CR7","unstructured":"Santurkar S, Tsipras D, Ilyas A, Madry A (2018) How does batch normalization help optimization?, pp 2488\u20132498"},{"key":"4207_CR8","unstructured":"Qiao S, Wang H, Liu C, Shen W, Yuille AL (2019) Weight standardization. CoRR arXiv:1903.10520"},{"key":"4207_CR9","doi-asserted-by":"crossref","unstructured":"Nesterov YE (2004) Introductory lectures on convex optimization - a basic course vol 87 of applied optimization (springer)","DOI":"10.1007\/978-1-4419-8853-9"},{"key":"4207_CR10","unstructured":"Li H, Xu Z, Taylor G, Studer C, Goldstein T (2018) Visualizing the loss landscape of neural nets, pp 6391\u20136401"},{"key":"4207_CR11","unstructured":"Pascanu R, Mikolov T, Bengio Y (2013) On the difficulty of training recurrent neural networks, vol 28 of JMLR workshop and conference proceedings, pp 1310\u20131318 (JMLR.org)"},{"issue":"2","key":"4207_CR12","first-page":"26","volume":"4","author":"T Tieleman","year":"2012","unstructured":"Tieleman T, Hinton G (2012) Lecture 6.5-rmsprop: divide the gradient by a running average of its recent magnitude. COURSERA: Neural Netw Mach Learn 4(2):26\u201331","journal-title":"COURSERA: Neural Netw Mach Learn"},{"key":"4207_CR13","unstructured":"Kingma DP, Ba J (2015) Adam: a method for stochastic optimization. In: ICLR"},{"key":"4207_CR14","unstructured":"Wilson AC, Roelofs R, Stern M, Srebro N, Recht B (2017) The marginal value of adaptive gradient methods in machine learning, pp 4148\u20134158"},{"key":"4207_CR15","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition, pp 770\u2013778 (IEEE computer society)","DOI":"10.1109\/CVPR.2016.90"},{"key":"4207_CR16","doi-asserted-by":"crossref","unstructured":"Saon G et al (2017) English conversational telephone speech recognition by humans and machines, pp 132\u2013136","DOI":"10.21437\/Interspeech.2017-405"},{"issue":"1","key":"4207_CR17","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1016\/S0893-6080(98)00116-6","volume":"12","author":"N Qian","year":"1999","unstructured":"Qian N (1999) On the momentum term in gradient descent learning algorithms. Neural Netw 12 (1):145\u2013151","journal-title":"Neural Netw"},{"issue":"4","key":"4207_CR18","doi-asserted-by":"publisher","first-page":"3939","DOI":"10.1007\/s10489-021-02224-6","volume":"52","author":"W Yuan","year":"2022","unstructured":"Yuan W, Hu F, Lu L (2022) A new non-adaptive optimization method: stochastic gradient descent with momentum and difference. Appl Intell 52(4):3939\u20133953","journal-title":"Appl Intell"},{"issue":"7","key":"4207_CR19","first-page":"2557","volume":"31","author":"W Tao","year":"2020","unstructured":"Tao W, Pan Z, Wu G, Tao Q (2020) The strength of nesterov\u2019s extrapolation in the individual convergence of nonsmooth optimization. IEEE Trans Neural Netw Learn Syst 31(7):2557\u20132568","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"4207_CR20","doi-asserted-by":"crossref","unstructured":"Gui Y, Li D, Fang R (2022) A fast adaptive algorithm for training deep neural networks. Appl Intell","DOI":"10.1007\/s10489-022-03629-7"},{"key":"4207_CR21","unstructured":"Paszke A et al (2019) PyTorch: an imperative style, high-performance deep learning library (eds Wallach, H. et al) advances in neural information processing systems vol 32, pp 8024\u20138035 (curran associates, Inc.). http:\/\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"4207_CR22","unstructured":"Abadi M et al (2015) TensorFlow: large-scale machine learning on heterogeneous systems. https:\/\/www.tensorflow.org\/. Software available from tensorflow.org"},{"issue":"3","key":"4207_CR23","doi-asserted-by":"publisher","first-page":"1460","DOI":"10.1007\/s10489-020-01892-0","volume":"51","author":"R Yedida","year":"2021","unstructured":"Yedida R, Saha S, Prashanth T (2021) LipschitzLR: using theoretically computed adaptive learning rates for fast convergence. Appl Intell 51(3):1460\u20131478","journal-title":"Appl Intell"},{"key":"4207_CR24","unstructured":"Zhang J, He T, Sra S, Jadbabaie A (2020) Why gradient clipping accelerates training: a theoretical justification for adaptivity. (OpenReview.net)"},{"issue":"3","key":"4207_CR25","doi-asserted-by":"publisher","first-page":"200","DOI":"10.1287\/ijoo.2018.0010","volume":"1","author":"FE Curtis","year":"2019","unstructured":"Curtis FE, Scheinberg K, Shi R (2019) A stochastic trust region algorithm based on careful step normalization. Informs J Optimization 1(3):200\u2013220","journal-title":"Informs J Optimization"},{"key":"4207_CR26","unstructured":"Bello I, Zoph B, Vasudevan V, Le QV (2017) Neural optimizer search with reinforcement learning. Vol 70 of proceedings of machine learning research, pp 459\u2013468 (PMLR)"},{"key":"4207_CR27","doi-asserted-by":"crossref","unstructured":"Jiaocheng M, Jinan S, Xin Z, Yuan Peng Z (2022) Bayes-dcgru with bayesian optimization for rolling bearing fault diagnosis. Appl Intell","DOI":"10.1007\/s10489-021-02924-z"},{"key":"4207_CR28","unstructured":"Ackley D (2012) A connectionist machine for genetic hillclimbing. Vol 28 (Springer science & business media"},{"key":"4207_CR29","unstructured":"Krizhevsky A, Hinton G (2009) Learning multiple layers of features from tiny images. Technical report"},{"key":"4207_CR30","unstructured":"Zagoruyko S, Komodakis N (2017) Paying more attention to attention: improving the performance of convolutional neural networks via attention transfer. (OpenReview.net"},{"key":"4207_CR31","doi-asserted-by":"crossref","unstructured":"Parkhi OM, Vedaldi A, Zisserman A, Jawahar CV (2012) Cats and dogs. pp 3498\u20133505 (IEEE computer society","DOI":"10.1109\/CVPR.2012.6248092"},{"issue":"4","key":"4207_CR32","doi-asserted-by":"publisher","first-page":"838","DOI":"10.1137\/0330046","volume":"30","author":"BT Polyak","year":"1992","unstructured":"Polyak BT, Juditsky AB (1992) Acceleration of stochastic approximation by averaging. SIAM J Contr Optimization 30(4):838\u2013855","journal-title":"SIAM J Contr Optimization"},{"key":"4207_CR33","unstructured":"Liu L et al (2020) On the variance of the adaptive learning rate and beyond. (OpenReview.net)"},{"key":"4207_CR34","unstructured":"Riedmiller MA, Braun H (1993) A direct adaptive method for faster backpropagation learning: the RPROP algorithm. pp 586\u2013591 (IEEE)"},{"issue":"3","key":"4207_CR35","doi-asserted-by":"publisher","first-page":"3249","DOI":"10.1007\/s10489-021-02558-1","volume":"52","author":"Y Wang","year":"2022","unstructured":"Wang Y, Li K, Lei Y (2022) A general multi-scale image classification based on shared conversion matrix routing. Appl Intell 52(3):3249\u20133265","journal-title":"Appl Intell"},{"key":"4207_CR36","doi-asserted-by":"crossref","unstructured":"Peters ME et al (2018) Deep contextualized word representations. pp 2227\u20132237 (association for computational linguistics","DOI":"10.18653\/v1\/N18-1202"},{"key":"4207_CR37","unstructured":"Vaswani A et al (2017) Attention is all you need. pp 5998\u20136008"},{"key":"4207_CR38","unstructured":"Devlin J, Chang M, Lee K, Toutanova K (2019) BERT: pre-training of deep bidirectional transformers for language understanding. pp 4171\u20134186 (association for computational linguistics)"},{"key":"4207_CR39","unstructured":"Dosovitskiy A et al (2021) An image is worth 16x16 words: transformers for image recognition at scale. (OpenReview.net)"},{"key":"4207_CR40","doi-asserted-by":"crossref","unstructured":"Szegedy C, Vanhoucke V, Ioffe S, Shlens J, Wojna Z (2016) Rethinking the inception architecture for computer vision. pp 2818\u20132826 (IEEE computer society)","DOI":"10.1109\/CVPR.2016.308"},{"key":"4207_CR41","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W (2002) Bleu: a method for automatic evaluation of machine translation. pp 311\u2013318 (ACL)","DOI":"10.3115\/1073083.1073135"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-022-04207-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-022-04207-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-022-04207-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,5,31]],"date-time":"2023-05-31T06:35:12Z","timestamp":1685514912000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-022-04207-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,22]]},"references-count":41,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["4207"],"URL":"https:\/\/doi.org\/10.1007\/s10489-022-04207-7","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,10,22]]},"assertion":[{"value":"26 September 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 October 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"<!--Emphasis Type='Bold' removed-->Conflict of Interests"}}]}}