{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T18:27:31Z","timestamp":1780511251874,"version":"3.54.1"},"reference-count":43,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,2]]},"DOI":"10.1109\/ita.2018.8503173","type":"proceedings-article","created":{"date-parts":[[2018,10,25]],"date-time":"2018-10-25T23:42:52Z","timestamp":1540510972000},"page":"1-9","source":"Crossref","is-referenced-by-count":27,"title":["On the Insufficiency of Existing Momentum Schemes for Stochastic Optimization"],"prefix":"10.1109","author":[{"given":"Rahul","family":"Kidambi","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Praneeth","family":"Netrapalli","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Prateek","family":"Jain","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sham","family":"Kakade","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/29.61535"},{"key":"ref38","article-title":"A stochastic gradient method with an exponential convergence rate for strongly-convex optimization with finite training sets","author":"le roux","year":"2012","journal-title":"NIPS 25"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/0041-5553(64)90137-5"},{"key":"ref32","article-title":"Path-sgd: Path-normalized optimization in deep neural networks","author":"neyshabur","year":"2015","journal-title":"CoRR abs\/1506 02617"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1137\/100802001"},{"key":"ref30","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4419-8853-9","author":"nesterov","year":"2004","journal-title":"Introductory Lectures on Convex Optimization A Basic Course Volume 87 of Applied Optimization"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177729586"},{"key":"ref36","article-title":"A generic approach for escaping saddle points","author":"reddi","year":"2017","journal-title":"ar Xiv preprint arXiv 1709 01434"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.1974.1100738"},{"key":"ref34","author":"polyak","year":"1987","journal-title":"Introduction to Optimization"},{"key":"ref10","article-title":"Harder, better, faster, stronger convergence rates for least-squares regression","author":"dieuleveut","year":"2016","journal-title":"CoRR abs\/1602 05419"},{"key":"ref40","article-title":"Stochastic dual coordinate ascent methods for regularized loss minimization","author":"shalev-shwartz","year":"2012","journal-title":"CoRR abs\/1209 1873"},{"key":"ref11","first-page":"2121","article-title":"Elad Hazan, and Yoram Singer. Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"duchi","year":"2011","journal-title":"Journal of Machine Learning Research"},{"key":"ref12","article-title":"Un-regularizing: approximate proximal point and faster stochastic algorithms for empirical risk minimization","author":"frostig","year":"2015","journal-title":"ICML"},{"key":"ref13","article-title":"Competing with the empirical risk minimizer in a single pass","author":"roy","year":"2015","journal-title":"COLT"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1137\/110848864"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1137\/110848876"},{"key":"ref16","doi-asserted-by":"crossref","first-page":"630","DOI":"10.1007\/978-3-319-46493-0_38","article-title":"Identity mappings in deep residual networks","author":"he","year":"2016","journal-title":"ECCV (4) Ser Lecture Notes in Computer Science"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref18","doi-asserted-by":"crossref","first-page":"504","DOI":"10.1126\/science.1127647","article-title":"Reducing the dimensionality of data with neural networks","volume":"313","author":"hinton","year":"2006","journal-title":"Science"},{"key":"ref19","article-title":"Parallelizing stochastic approximation through mini - batching and tail-averaging","author":"prateek","year":"2016","journal-title":"arXiv preprint arXiv 1610 03774"},{"key":"ref28","first-page":"372","article-title":"A method of solving a convex programming problem with convergence rate o(1\/k2)","volume":"27","author":"nesterov","year":"1983","journal-title":"Soviet Mathematics Doklady"},{"key":"ref4","article-title":"The tradeoffs of large scale learning","author":"bottou","year":"2007","journal-title":"NIPS 20"},{"key":"ref27","article-title":"Optimizing neural networks with kronecker-factored approximate curvature","author":"martens","year":"2015","journal-title":"International Conference on Machine Learning"},{"key":"ref3","article-title":"The first direct acceleration of stochastic gradient methods","author":"allen-zhu","year":"2016","journal-title":"CoRR abs\/1603 05953"},{"key":"ref6","article-title":"Entropy-sgd: Biasing gradient descent into wide valleys","author":"chaudhari","year":"2017","journal-title":"CoRR abs\/1611 01838"},{"key":"ref29","article-title":"Gradient methods for minimizing composite functions","author":"nesterov","year":"2012","journal-title":"Mathematical Programming Series B"},{"key":"ref5","article-title":"Methode generale pour la resolution des systemes d&#x2019; equations simultanees","author":"cauchy","year":"1847","journal-title":"C R Acad Sci Paris"},{"key":"ref8","article-title":"SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives","author":"aaron","year":"2014","journal-title":"NIPS 27"},{"key":"ref7","article-title":"A simple practical accelerated method for finite sums","author":"defazio","year":"2016","journal-title":"Advances in Neural Information Processing Systems 29 (NIPS 2016)"},{"key":"ref2","year":"0","journal-title":"PyTorch"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-013-0677-5"},{"key":"ref1","year":"0","journal-title":"Preresnet-44 for cifar-10"},{"key":"ref20","article-title":"Accelerating stochastic gradient descent","author":"prateek","year":"2017","journal-title":"arXiv preprint arXiv 1704 08227"},{"key":"ref22","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","author":"shirish keskar","year":"2016","journal-title":"CoRR abs\/1609 04836"},{"key":"ref21","article-title":"Accelerating stochastic gradient descent using predictive variance reduction","author":"johnson","year":"2013","journal-title":"NIPS 26"},{"key":"ref42","article-title":"Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude","author":"tieleman","year":"2012","journal-title":"COURSERA Neural Networks for Machine Learning"},{"key":"ref24","author":"krizhevsky","year":"2009","journal-title":"Learning multiple layers of features from tiny images"},{"key":"ref41","first-page":"1139","article-title":"On the importance of initialization and momentum in deep learning","author":"sutskever","year":"2013","journal-title":"International Conference on Machine Learning"},{"key":"ref23","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"CoRR abs\/1412 6980"},{"key":"ref26","article-title":"Deep learning via hessian-free optimization","author":"martens","year":"2010","journal-title":"International Conference on Machine Learning"},{"key":"ref43","article-title":"Yellowfin and the art of momentum tuning","author":"zhang","year":"2017","journal-title":"CoRR abs\/1706 03471"},{"key":"ref25","article-title":"A universal catalyst for first-order optimization","author":"lin","year":"2015","journal-title":"NIPS"}],"event":{"name":"2018 Information Theory and Applications Workshop (ITA)","location":"San Diego, CA","start":{"date-parts":[[2018,2,11]]},"end":{"date-parts":[[2018,2,16]]}},"container-title":["2018 Information Theory and Applications Workshop (ITA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8486613\/8502949\/08503173.pdf?arnumber=8503173","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,26]],"date-time":"2022-01-26T17:52:10Z","timestamp":1643219530000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8503173\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,2]]},"references-count":43,"URL":"https:\/\/doi.org\/10.1109\/ita.2018.8503173","relation":{},"subject":[],"published":{"date-parts":[[2018,2]]}}}