{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T17:58:39Z","timestamp":1778003919837,"version":"3.51.4"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"13","license":[{"start":{"date-parts":[[2022,12,17]],"date-time":"2022-12-17T00:00:00Z","timestamp":1671235200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,12,17]],"date-time":"2022-12-17T00:00:00Z","timestamp":1671235200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2023,7]]},"DOI":"10.1007\/s10489-022-04382-7","type":"journal-article","created":{"date-parts":[[2022,12,17]],"date-time":"2022-12-17T11:03:37Z","timestamp":1671275017000},"page":"16844-16858","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["DiffMoment: an adaptive optimization technique for convolutional neural network"],"prefix":"10.1007","volume":"53","author":[{"given":"Shubhankar","family":"Bhakta","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9638-1906","authenticated-orcid":false,"given":"Utpal","family":"Nandi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tapas","family":"Si","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sudipta Kr","family":"Ghosal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chiranjit","family":"Changdar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rajat Kumar","family":"Pal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,12,17]]},"reference":[{"key":"4382_CR1","unstructured":"Rumelhart DE, Hinton GE, Williams RJ (1988) Learning representations by back-propagating errors. In: Neurocomputing:Foundations of research. http:\/\/dl.acm.org\/citation.cfm?id=65669.104451. MIT Press, Cambridge, pp 696\u2013699"},{"key":"4382_CR2","doi-asserted-by":"publisher","first-page":"150","DOI":"10.1007\/s10489-014-0645-7","volume":"43","author":"S Mirjalili","year":"2015","unstructured":"Mirjalili S (2015) How effective is the Grey Wolf optimizer in training multi-layer perceptrons. Appl Intell 43:150\u2013161. https:\/\/doi.org\/10.1007\/s10489-014-0645-7","journal-title":"Appl Intell"},{"key":"4382_CR3","doi-asserted-by":"publisher","first-page":"322","DOI":"10.1007\/s10489-016-0767-1","volume":"45","author":"H Faris","year":"2016","unstructured":"Faris H, Aljarah I, Mirjalili S (2016) Training feedforward neural networks using multi-verse optimizer for binary classification problems. Appl Intell 45:322\u2013332. https:\/\/doi.org\/10.1007\/s10489-016-0767-1","journal-title":"Appl Intell"},{"key":"4382_CR4","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun Y, Bengio Y, Hinton G (2015) Deep learning. Nature 521:436\u201344. https:\/\/doi.org\/10.1038\/nature14539","journal-title":"Nature"},{"key":"4382_CR5","unstructured":"Goodfellow I, Bengio Y, Courville A (2016) Deep learning. MIT Press. http:\/\/www.deeplearningbook.org"},{"key":"4382_CR6","doi-asserted-by":"publisher","unstructured":"Girshick R, Donahue J, Darrell T, Malik J (2014) Rich feature hierarchies for accurate object detection and semantic segmentation. In: 2014 IEEE conference on computer vision and pattern recognition, pp 580\u2013587, DOI https:\/\/doi.org\/10.1109\/CVPR.2014.81, (to appear in print)","DOI":"10.1109\/CVPR.2014.81"},{"key":"4382_CR7","doi-asserted-by":"publisher","unstructured":"Girshick R (2015) Fast r-cnn. In: 2015 IEEE international conference on computer vision (ICCV), pp 1440\u20131448, DOI https:\/\/doi.org\/10.1109\/ICCV.2015.169, (to appear in print)","DOI":"10.1109\/ICCV.2015.169"},{"issue":"6","key":"4382_CR8","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren S, He K, Girshick R, Sun J (2017) Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39(6):1137\u20131149. https:\/\/doi.org\/10.1109\/TPAMI.2016.2577031","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"4382_CR9","unstructured":"Krizhevsky A, Sutskever I, Hinton G E (2012) Imagenet classification with deep convolutional neural networks. In: Pereira F, Burges C J C, Bottou L, Weinberger K Q (eds) Advances in neural information processing systems, vol 25. Curran Associates, Inc. pp 1097\u20131105"},{"issue":"1","key":"4382_CR10","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1016\/S0893-6080(98)00116-6","volume":"12","author":"N Qian","year":"1999","unstructured":"Qian N (1999) On the momentum term in gradient descent learning algorithms. Neural Netw 12 (1):145\u2013151. http:\/\/dblp.uni-trier.de\/db\/journals\/nn\/nn12.html#Qian99","journal-title":"Neural Netw"},{"issue":"61","key":"4382_CR11","first-page":"2121","volume":"12","author":"J Duchi","year":"2011","unstructured":"Duchi J, Hazan E, Singer Y (2011) Adaptive subgradient methods for online learning and stochastic optimization. J Mach Learn Res 12(61):2121\u20132159. http:\/\/jmlr.org\/papers\/v12\/duchi11a.html","journal-title":"J Mach Learn Res"},{"key":"4382_CR12","unstructured":"Kingma D P, Ba J (2015) Adam: a method for stochastic optimization. In: Bengio Y, LeCun Y (eds) 3Rd international conference on learning representations, ICLR 2015, San diego, May 7-9, p 2015"},{"key":"4382_CR13","doi-asserted-by":"publisher","unstructured":"Dogo EM, Afolabi OJ, Nwulu NI, Twala B, Aigbavboa CO (2018) A comparative analysis of gradient descent-based optimization algorithms on convolutional neural networks. In: 2018 International conference on computational techniques, electronics and mechanical systems (CTEMS), pp 92\u201399, DOI https:\/\/doi.org\/10.1109\/CTEMS.2018.8769211, (to appear in print)","DOI":"10.1109\/CTEMS.2018.8769211"},{"key":"4382_CR14","doi-asserted-by":"crossref","unstructured":"Bottou L (2010) Large-scale machine learning with stochastic gradient descent. In: Lechevallier Y, Saporta G (eds) Proceedings of COMPSTAT\u20192010. Physica-Verlag HD, Heidelberg, pp 177\u2013186","DOI":"10.1007\/978-3-7908-2604-3_16"},{"issue":"3","key":"4382_CR15","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H Robbins","year":"1951","unstructured":"Robbins H, Monro S (1951) A stochastic approximation method. Annals Math Stat 22 (3):400\u2013407. http:\/\/www.jstor.org\/stable\/2236626","journal-title":"Annals Math Stat"},{"key":"4382_CR16","unstructured":"Sutskever I, Martens J, Dahl G, Hinton G (2013) On the importance of initialization and momentum in deep learning. In: Proceedings of the 30th International Conference on International Conference on Machine Learning - Vol 28. JMLR.org, ICML\u201913, p III\u20131139\u2013III\u20131147"},{"key":"4382_CR17","doi-asserted-by":"publisher","unstructured":"Botev A, Lever G, Barber D (2017) Nesterov\u2019s accelerated gradient and momentum as approximations to regularised update descent. In: 2017 International joint conference on neural networks (IJCNN), pp 1899\u20131903, DOI https:\/\/doi.org\/10.1109\/IJCNN.2017.7966082https:\/\/doi.org\/10.1109\/IJCNN.2017.7966082 , (to appear in print)","DOI":"10.1109\/IJCNN.2017.7966082 10.1109\/IJCNN.2017.7966082"},{"key":"4382_CR18","unstructured":"Lydia A, Francis S (2019) Adagrad - an optimizer for stochastic gradient descent vol 6. pp 566\u2013568"},{"key":"4382_CR19","doi-asserted-by":"publisher","unstructured":"Fang JK, Fong CM, Yang P, Hung CK, Lu WL, Chang CW (2020) Adagrad gradient descent method for ai image management. In: 2020 IEEE International Conference on Consumer Electronics - Taiwan (ICCE-Taiwan), pp 1\u20132, DOI https:\/\/doi.org\/10.1109\/ICCE-Taiwan49838.2020.9258085, (to appear in print)","DOI":"10.1109\/ICCE-Taiwan49838.2020.9258085"},{"key":"4382_CR20","unstructured":"Zeiler M D (2012) ADADELTA: An adaptive learning rate method. CoRR:1212.5701"},{"key":"4382_CR21","unstructured":"Hinton G, Srivastava KSN (2012) Lecture 6a overview of mini-batch gradient descent course. In: Neural networks for machine learning"},{"key":"4382_CR22","doi-asserted-by":"crossref","unstructured":"Dorronsoro JR, Gonzalez A, Cruz CS (2002) Natural Gradient Learning in NLDA Networks, Conference on Artificial and Natural Neural Networks: Connectionist Models of Neurons, Learning Processes and Artificial Intelligence-Part-I, pp 427\u2013434. https:\/\/dl.acm.org\/doi\/10.5555\/646369.690794","DOI":"10.1007\/3-540-45720-8_50"},{"key":"4382_CR23","doi-asserted-by":"publisher","unstructured":"Zhang J (2019) Gradient descent based optimization algorithms for deep learning models training. CoRR: 1903.03614. https:\/\/doi.org\/10.1007\/978-3-030-33904-3_40","DOI":"10.1007\/978-3-030-33904-3_40"},{"key":"4382_CR24","unstructured":"Wilson AC, Roelofs R, Stern M, Srebro N, Recht B (2017) The marginal value of adaptive gradient methods in machine learning. In: Guyon I, Luxburg UV, Bengio S, Wallach H, Fergus R, Vishwanathan S, Garnett R (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol 30. https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/81b3833e2504647f9d794f7d7b9bf341-Paper.pdf"},{"key":"4382_CR25","unstructured":"Lyu K, Li J (2020) Gradient descent maximizes the margin of homogeneous neural networks. In: International conference on learning representations. https:\/\/openreview.net\/forum?id=SJeLIgBKPS"},{"key":"4382_CR26","unstructured":"Zhuang J, Tang T, Ding Y, Tatikonda S, Dvornek N C, Papademetris X, Duncan JS (2020) Adabelief optimizer: Adapting stepsizes by the belief in observed gradients. CoRR: 2010.07468"},{"key":"4382_CR27","unstructured":"Defazio A, Jelassi S (2021) Adaptivity without compromise: A momentumized, adaptive, dual averaged gradient method for stochastic optimization. 2101.11075"},{"issue":"11","key":"4382_CR28","doi-asserted-by":"publisher","first-page":"4500","DOI":"10.1109\/TNNLS.2019.2955777","volume":"31","author":"SR Dubey","year":"2020","unstructured":"Dubey SR, Chakraborty S, Roy SK, Mukherjee S, Singh SK, Chaudhuri BB (2020) diffgrad: An optimization method for convolutional neural networks. IEEE Trans Neural Netw Learn Syst 31 (11):4500\u20134511. https:\/\/doi.org\/10.1109\/TNNLS.2019.2955777","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"4382_CR29","doi-asserted-by":"publisher","unstructured":"Yong H, Huang J, Hua X, Zhang L (2020) Gradient-centralization: A new optimization technique for deep neural networks. In: Vedaldi A, Bischof H, Brox T, Frahm JM (eds) Computer Vision ECCV 2020. ECCV 2020. Lecture Notes in Computer Science, vol 12346. https:\/\/doi.org\/10.1007\/978-3-030-58452-8_37. Springer, Cham","DOI":"10.1007\/978-3-030-58452-8_37"},{"key":"4382_CR30","unstructured":"Liu L, Jiang H, He P, Chen W, Liu X, Gao J, Han J (2020) On the variance of the adaptive learning rate and beyond. In: Proceedings of the Eighth International Conference on Learning Representations (ICLR), vol 2020. pp 1\u201313"},{"key":"4382_CR31","volume-title":"Learning multiple layers of features from tiny images","author":"A Krizhevsky","year":"2009","unstructured":"Krizhevsky A (2009) Learning multiple layers of features from tiny images. University of Tront, Master\u2019s thesis"},{"issue":"3","key":"4382_CR32","doi-asserted-by":"publisher","first-page":"175","DOI":"10.1093\/comjnl\/3.3.175","volume":"3","author":"HH Rosenbrock","year":"1960","unstructured":"Rosenbrock HH (1960) An Automatic Method for Finding the Greatest or Least Value of a Function. Comput J 3(3):175\u2013184. https:\/\/doi.org\/10.1093\/comjnl\/3.3.175. https:\/\/academic.oup.com\/comjnl\/article-lookup\/doi\/10.1093\/comjnl\/3.3.175","journal-title":"Comput J"},{"key":"4382_CR33","unstructured":"Dauphin YN, Pascanu R, Gulcehre C, Cho K, Ganguli S, Bengio Y (2014) Identifying and attacking the saddle point problem in high-dimensional non-convex optimization. In: Ghahramani Z, Welling M, Cortes C, Lawrence N, Weinberger KQ (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol 27. https:\/\/proceedings.neurips.cc\/paper\/2014\/file\/17e23e50bedc63b4095e3d8204ce063b-Paper.pdf"},{"key":"4382_CR34","unstructured":"Lacotte J, Pilanci M (2020) All local minima are global for two-layer relu neural networks: The hidden convex optimization landscape. 2006.05900"},{"key":"4382_CR35","unstructured":"Kawaguchi K, Kaelbling LP (2020) Elimination of all bad local minima in deep learning. In: Chiappa S, Calandra R (eds) Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics. PMLR,108, pp 853\u2013863"},{"issue":"3-4","key":"4382_CR36","doi-asserted-by":"publisher","first-page":"142","DOI":"10.1561\/2200000058","volume":"10","author":"P Jain","year":"2017","unstructured":"Jain P, Kar P (2017) Non-convex optimization for machine learning. Found Trends Mach Learn 10(3-4):142\u2013336. https:\/\/doi.org\/10.1561\/2200000058","journal-title":"Found Trends Mach Learn"},{"key":"4382_CR37","unstructured":"Danilova M, Dvurechensky PE, Gasnikov AV, Gorbunov EA, Guminov S, Kamzolov D, Shibaev I (2020) Recent theoretical advances in non-convex optimization. CoRR:2012.06188. https:\/\/dblp.uni-trier.de\/rec\/journals\/corr\/abs-2012-06188.html"},{"key":"4382_CR38","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 770\u2013778, DOI https:\/\/doi.org\/10.1109\/CVPR.2016.90, (to appear in print)","DOI":"10.1109\/CVPR.2016.90"},{"key":"4382_CR39","doi-asserted-by":"publisher","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: 2015 IEEE conference on computer vision and pattern recognition (CVPR), pp 1\u20139, DOI https:\/\/doi.org\/10.1109\/CVPR.2015.7298594, (to appear in print)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"4382_CR40","doi-asserted-by":"publisher","unstructured":"Liu S, Deng W (2015) Very deep convolutional neural network based image classification using small training sample size. In: 2015 3rd IAPR Asian conference on pattern recognition (ACPR), pp 730\u2013734, DOI https:\/\/doi.org\/10.1109\/ACPR.2015.7486599, (to appear in print)","DOI":"10.1109\/ACPR.2015.7486599"},{"key":"4382_CR41","doi-asserted-by":"publisher","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 1\u20139, DOI https:\/\/doi.org\/10.1109\/CVPR.2015.7298594, (to appear in print)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"4382_CR42","unstructured":"Nair V, Hinton GE (2010) Rectified linear units improve restricted Boltzmann machines. In: Proceedings Of the 27th international conference on machine learning, Haifa, 21 June 2010, pp 807\u2013814"},{"key":"4382_CR43","unstructured":"Andrew L, Maas AYN, Hannun AY (2013) Rectifier nonlinearities improve neural network acoustic models. In: Proceedings of the international conference on machine learning, pp 1\u20136"},{"key":"4382_CR44","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. In: Pereira F, Burges CJC, Bottou L, Weinberger KQ (eds) Advances in Neural Information Processing Systems 25, Curran Associates, Inc., pp 1097\u20131105. http:\/\/papers.nips.cc\/paper\/4824-imagenet-classification.pdf"},{"key":"4382_CR45","doi-asserted-by":"publisher","unstructured":"Huang G, Liu Z, van der Maaten L, Weinberger K (2017) Densely Connected Convolutional Networks. https:\/\/doi.org\/10.1109\/CVPR.2017.243","DOI":"10.1109\/CVPR.2017.243"},{"key":"4382_CR46","doi-asserted-by":"crossref","unstructured":"LecunBottou Y, et al. (1998) Gradient-based learning applied to document recognition. In: Proceedings of the IEEE, vol 88. no. 11","DOI":"10.1109\/5.726791"},{"issue":"6","key":"4382_CR47","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1109\/MSP.2012.2211477","volume":"29","author":"L Deng","year":"2012","unstructured":"Deng L (2012) \/E MNIST database of handwritten digit images for machine learning research [best of the web]. IEEE Signal Proc Mag 29(6):141\u2013142","journal-title":"IEEE Signal Proc Mag"},{"issue":"2","key":"4382_CR48","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1162\/089976698300017746","volume":"10","author":"S-i Amari","year":"1998","unstructured":"Amari S-i (1998) Natural gradient works efficiently in learning. Neural Comput 10(2):251\u2013276","journal-title":"Neural Comput"},{"key":"4382_CR49","unstructured":"Maas AL, Hannun AY, Ng AY (2013) Rectifier nonlinearities improve neural network acoustic models. In: Proceedings of the 30th International Conference on Machine Learning, Atlanta, GA, USA, vol 30. pp 16\u201321"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-022-04382-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-022-04382-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-022-04382-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,1]],"date-time":"2023-07-01T05:13:34Z","timestamp":1688188414000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-022-04382-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,12,17]]},"references-count":49,"journal-issue":{"issue":"13","published-print":{"date-parts":[[2023,7]]}},"alternative-id":["4382"],"URL":"https:\/\/doi.org\/10.1007\/s10489-022-04382-7","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,12,17]]},"assertion":[{"value":"5 December 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 December 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no potential conflict of interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"<!--Emphasis Type='Bold' removed-->Conflict of Interests"}}]}}