{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T14:20:52Z","timestamp":1760710852885,"version":"3.37.3"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,1,26]],"date-time":"2022-01-26T00:00:00Z","timestamp":1643155200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,26]],"date-time":"2022-01-26T00:00:00Z","timestamp":1643155200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100000781","name":"European Research Council","doi-asserted-by":"publisher","award":["FACTORY-CoG-6681839"],"award-info":[{"award-number":["FACTORY-CoG-6681839"]}],"id":[{"id":"10.13039\/501100000781","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001665","name":"Agence Nationale de la Recherche","doi-asserted-by":"publisher","award":["ANR 3IA-ANITI"],"award-info":[{"award-number":["ANR 3IA-ANITI"]}],"id":[{"id":"10.13039\/501100001665","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1007\/s11063-021-10705-5","type":"journal-article","created":{"date-parts":[[2022,1,26]],"date-time":"2022-01-26T00:03:51Z","timestamp":1643155431000},"page":"1727-1752","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Second-Order Step-Size Tuning of SGD for Non-Convex Optimization"],"prefix":"10.1007","volume":"54","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7384-6387","authenticated-orcid":false,"given":"Camille","family":"Castera","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"J\u00e9r\u00f4me","family":"Bolte","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"C\u00e9dric","family":"F\u00e9votte","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Edouard","family":"Pauwels","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,1,26]]},"reference":[{"issue":"1","key":"10705_CR1","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1007\/BF01584842","volume":"81","author":"YI Alber","year":"1998","unstructured":"Alber YI, Iusem AN, Solodov MV (1998) On the projected subgradient method for nonsmooth convex optimization in a hilbert space. Math Program 81(1):23\u201335","journal-title":"Math Program"},{"key":"10705_CR2","unstructured":"Allen-Zhu Z (2018) Natasha 2: faster non-convex optimization than SGD. In: Advances in Neural Information Processing Systems (NIPS), pp 2675\u20132686"},{"issue":"2","key":"10705_CR3","doi-asserted-by":"publisher","first-page":"247","DOI":"10.1023\/B:JOTA.0000015684.50827.49","volume":"120","author":"F Alvarez","year":"2004","unstructured":"Alvarez F, Cabot A (2004) Steepest descent with curvature dynamical system. J Optim Theory Appl 120(2):247\u2013273","journal-title":"J Optim Theory Appl"},{"issue":"5","key":"10705_CR4","doi-asserted-by":"publisher","first-page":"1040","DOI":"10.1080\/10556788.2012.667811","volume":"28","author":"S Babaie-Kafaki","year":"2013","unstructured":"Babaie-Kafaki S, Fatemi M (2013) A modified two-point stepsize gradient algorithm for unconstrained minimization. Optim Methods Softw 28(5):1040\u20131050","journal-title":"Optim Methods Softw"},{"key":"10705_CR5","unstructured":"Barakat A, Bianchi P (2018) Convergence of the ADAM algorithm from a dynamical system viewpoint. arXiv:1810.02263"},{"issue":"1","key":"10705_CR6","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1093\/imanum\/8.1.141","volume":"8","author":"J Barzilai","year":"1988","unstructured":"Barzilai J, Borwein JM (1988) Two-point step size gradient methods. IMA J Numer Anal 8(1):141\u2013148","journal-title":"IMA J Numer Anal"},{"key":"10705_CR7","volume-title":"Nonlinear programming","author":"DP Bertsekas","year":"1998","unstructured":"Bertsekas DP, Hager W, Mangasarian O (1998) Nonlinear programming. Athena Scientific, Belmont, MA"},{"key":"10705_CR8","doi-asserted-by":"publisher","first-page":"626","DOI":"10.1007\/s10957-012-0265-5","volume":"158","author":"F Biglari","year":"2013","unstructured":"Biglari F, Solimanpur M (2013) Scaling on the spectral gradient method. J Optim Theory Appl 158:626\u2013635","journal-title":"J Optim Theory Appl"},{"key":"10705_CR9","unstructured":"Bolte J, Pauwels E (2020) A mathematical model for automatic differentiation in machine learning. In: advances in Neural Information Processing Systems (NIPS)"},{"key":"10705_CR10","unstructured":"Carmon Y, Duchi JC, Hinder O, Sidford A (2017) Convex until proven guilty: dimension-free acceleration of gradient descent on non-convex functions. In: proceedings of the international conference on machine learning (ICML), pp 654\u2013663"},{"issue":"134","key":"10705_CR11","first-page":"1","volume":"22","author":"C Castera","year":"2021","unstructured":"Castera C, Bolte J, F\u00e9votte C, Pauwels E (2021) An inertial Newton algorithm for deep learning. J Mach Learn Res 22(134):1\u201331","journal-title":"J Mach Learn Res"},{"issue":"2","key":"10705_CR12","doi-asserted-by":"publisher","first-page":"717","DOI":"10.1093\/imanum\/drv034","volume":"36","author":"FE Curtis","year":"2016","unstructured":"Curtis FE, Guo W (2016) Handling nonpositive curvature in a limited memory steepest descent method. IMA J Numer Anal 36(2):717\u2013742","journal-title":"IMA J Numer Anal"},{"issue":"1\u20132","key":"10705_CR13","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/s10107-018-1335-8","volume":"176","author":"FE Curtis","year":"2019","unstructured":"Curtis FE, Robinson DP (2019) Exploiting negative curvature in deterministic and stochastic optimization. Math Program 176(1\u20132):69\u201394","journal-title":"Math Program"},{"issue":"1","key":"10705_CR14","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1023\/A:1014838419611","volume":"22","author":"Y Dai","year":"2002","unstructured":"Dai Y, Yuan J, Yuan YX (2002) Modified two-point stepsize gradient methods for unconstrained optimization. Comput Optim Appl 22(1):103\u2013109","journal-title":"Comput Optim Appl"},{"issue":"1","key":"10705_CR15","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1007\/s10208-018-09409-5","volume":"20","author":"D Davis","year":"2020","unstructured":"Davis D, Drusvyatskiy D, Kakade S, Lee JD (2020) Stochastic subgradient method converges on tame functions. Found Comput Math 20(1):119\u2013154","journal-title":"Found Comput Math"},{"key":"10705_CR16","unstructured":"Duchi J, Hazan E, Singer Y (2011) Adaptive subgradient methods for online learning and stochastic optimization. J Mach Learn Res, 12(7)"},{"issue":"4","key":"10705_CR17","doi-asserted-by":"publisher","first-page":"3229","DOI":"10.1137\/17M1135086","volume":"28","author":"JC Duchi","year":"2018","unstructured":"Duchi JC, Ruan F (2018) Stochastic methods for composite and weakly convex optimization problems. SIAM J Optim 28(4):3229\u20133259","journal-title":"SIAM J Optim"},{"key":"10705_CR18","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"5786","key":"10705_CR19","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1126\/science.1127647","volume":"313","author":"GE Hinton","year":"2006","unstructured":"Hinton GE, Salakhutdinov RR (2006) Reducing the dimensionality of data with neural networks. Science 313(5786):504\u2013507","journal-title":"Science"},{"issue":"3","key":"10705_CR20","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1109\/MCSE.2007.55","volume":"9","author":"JD Hunter","year":"2007","unstructured":"Hunter JD (2007) Matplotlib: a 2d graphics environment. Comput Sci Eng 9(3):90\u201395","journal-title":"Comput Sci Eng"},{"key":"10705_CR21","unstructured":"Idelbayev Y (2018) Proper ResNet implementation for CIFAR10\/CIFAR100 in PyTorch. https:\/\/github.com\/akamaster\/pytorch_resnet_cifar10"},{"key":"10705_CR22","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: Accelerating deep network training by reducing internal covariate shift. In: proceedings of the international conference on machine learning (ICML), pp 448\u2013456"},{"key":"10705_CR23","unstructured":"Johnson R, Zhang T (2013) Accelerating stochastic gradient descent using predictive variance reduction. In: advances in neural information processing systems (NIPS), pp 315\u2013323"},{"key":"10705_CR24","unstructured":"Kingma DP, Ba J (2015) Adam: a method for stochastic optimization. In: proceedings of the international conference on learning representations (ICLR)"},{"key":"10705_CR25","unstructured":"Krishnan S, Xiao Y, Saurous RA (2018) Neumann optimizer: a practical optimization algorithm for deep neural networks. In: proceedings of the international conference on learning representations (ICLR)"},{"key":"10705_CR26","unstructured":"Krizhevsky A (2009) Learning multiple layers of features from tiny images. Tech. rep, Canadian Institute for Advanced Research"},{"issue":"11","key":"10705_CR27","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun Y, Bottou L, Bengio Y, Haffner P et al (1998) Gradient-based learning applied to document recognition. Proc IEEE 86(11):2278\u20132324","journal-title":"Proc IEEE"},{"key":"10705_CR28","unstructured":"LeCun Y, Cortes C, Burges C (2010) MNIST handwritten digit database. ATT Labs [Online] Available: www.https:\/\/yannlecuncom\/exdb\/mnist"},{"key":"10705_CR29","unstructured":"Li X, Orabona F (2019) On the convergence of stochastic gradient descent with adaptive stepsizes. In: proceedings of the international conference on artificial intelligence and statistics (AISTATS), pp 983\u2013992"},{"key":"10705_CR30","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1016\/j.patrec.2019.08.029","volume":"128","author":"J Liang","year":"2019","unstructured":"Liang J, Xu Y, Bao C, Quan Y, Ji H (2019) Barzilai-Borwein-based adaptive learning rate for deep learning. Pattern Recognit Lett 128:197\u2013203","journal-title":"Pattern Recognit Lett"},{"key":"10705_CR31","unstructured":"Lin M, Chen Q, Yan S (2013) Network in network. arXiv:1312.4400"},{"key":"10705_CR32","unstructured":"Liu M, Yang T (2017) On noisy negative curvature descent: Competing with gradient descent for faster non-convex optimization. arXiv:1709.08571"},{"key":"10705_CR33","unstructured":"Martens J, Grosse R (2015) Optimizing neural networks with kronecker-factored approximate curvature. In: proceedings of the international conference on machine learning (ICML), pp 2408\u20132417"},{"key":"10705_CR34","unstructured":"Paszke A, Gross S, Massa F, Lerer A, Bradbury J, Chanan G, Killeen T, Lin Z, Gimelshein N, Antiga L, et\u00a0al. (2019) Pytorch: an imperative style, high-performance deep learning library. In: advances in neural information processing systems (NIPS), pp 8026\u20138037"},{"issue":"1","key":"10705_CR35","doi-asserted-by":"publisher","first-page":"26","DOI":"10.1137\/S1052623494266365","volume":"7","author":"M Raydan","year":"1997","unstructured":"Raydan M (1997) The Barzilai and Borwein gradient method for the large scale unconstrained minimization problem. SIAM J Optim 7(1):26\u201333","journal-title":"SIAM J Optim"},{"issue":"1","key":"10705_CR36","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H Robbins","year":"1951","unstructured":"Robbins H, Monro S (1951) A stochastic approximation method. Ann Math Stat 22(1):400\u2013407","journal-title":"Ann Math Stat"},{"key":"10705_CR37","doi-asserted-by":"crossref","unstructured":"Robbins H, Siegmund D (1971) A convergence theorem for non negative almost supermartingales and some applications. In: optimizing methods in statistics, Elsevier, pp 233\u2013257","DOI":"10.1016\/B978-0-12-604550-5.50015-8"},{"key":"10705_CR38","doi-asserted-by":"crossref","unstructured":"Robles-Kelly A, Nazari A (2019) Incorporating the Barzilai-Borwein adaptive step size into subgradient methods for deep network training. In: 2019 digital image computing: techniques and applications (DICTA), pp 1\u20136","DOI":"10.1109\/DICTA47822.2019.8945980"},{"key":"10705_CR39","unstructured":"Rossum G (1995) Python reference manual. CWI (Centre for Mathematics and Computer Science)"},{"issue":"2","key":"10705_CR40","doi-asserted-by":"publisher","first-page":"1448","DOI":"10.1137\/17M1134329","volume":"28","author":"CW Royer","year":"2018","unstructured":"Royer CW, Wright SJ (2018) Complexity analysis of second-order line-search algorithms for smooth nonconvex optimization. SIAM J Optim 28(2):1448\u20131477","journal-title":"SIAM J Optim"},{"key":"10705_CR41","unstructured":"Schraudolph NN, Yu J, G\u00fcnter S (2007) A stochastic Quasi-Newton method for online convex optimization. In: proceedings of the international conference on artificial intelligence and statistics (AISTATS)"},{"key":"10705_CR42","unstructured":"Tan C, Ma S, Dai YH, Qian Y (2016) Barzilai-Borwein step size for stochastic gradient descent. In: advances in neural information processing systems (NIPS), pp 685\u2013693"},{"issue":"2","key":"10705_CR43","first-page":"26","volume":"4","author":"T Tieleman","year":"2012","unstructured":"Tieleman T, Hinton G (2012) Lecture 6.5-RMSprop: divide the gradient by a running average of its recent magnitude. COURSERA Neural Netw Mach Learn 4(2):26\u201331","journal-title":"COURSERA Neural Netw Mach Learn"},{"issue":"2","key":"10705_CR44","doi-asserted-by":"publisher","first-page":"22","DOI":"10.1109\/MCSE.2011.37","volume":"13","author":"Walt Svd","year":"2011","unstructured":"Svd Walt, Colbert SC, Varoquaux G (2011) The numpy array: a structure for efficient numerical computation. Comput Sci Eng 13(2):22\u201330","journal-title":"Comput Sci Eng"},{"key":"10705_CR45","unstructured":"Wilson AC, Roelofs R, Stern M, Srebro N, Recht B (2017) The marginal value of adaptive gradient methods in machine learning. In: advances in neural information processing systems (NIPS), pp 4148\u20134158"},{"issue":"10","key":"10705_CR46","doi-asserted-by":"publisher","first-page":"2986","DOI":"10.1016\/j.cam.2010.04.012","volume":"234","author":"Y Xiao","year":"2010","unstructured":"Xiao Y, Wang Q, Wang D (2010) Notes on the Dai-Yuan-Yuan modified spectral gradient method. J Comput Appl Math 234(10):2986\u20132992","journal-title":"J Comput Appl Math"},{"key":"10705_CR47","unstructured":"Zhuang J, Tang T, Ding Y, Tatikonda SC, Dvornek N, Papademetris X, Duncan J (2020) Adabelief optimizer: adapting stepsizes by the belief in observed gradients. Advances in Neural Information Processing Systems (NIPS) 33"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-021-10705-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-021-10705-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-021-10705-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,28]],"date-time":"2022-05-28T13:16:15Z","timestamp":1653743775000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-021-10705-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,26]]},"references-count":47,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2022,6]]}},"alternative-id":["10705"],"URL":"https:\/\/doi.org\/10.1007\/s11063-021-10705-5","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"type":"print","value":"1370-4621"},{"type":"electronic","value":"1573-773X"}],"subject":[],"published":{"date-parts":[[2022,1,26]]},"assertion":[{"value":"19 November 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 January 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}