{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T09:11:24Z","timestamp":1754557884515,"version":"3.28.0"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,7]]},"DOI":"10.1109\/ijcnn48605.2020.9207425","type":"proceedings-article","created":{"date-parts":[[2020,9,30]],"date-time":"2020-09-30T00:40:33Z","timestamp":1601426433000},"page":"1-8","source":"Crossref","is-referenced-by-count":3,"title":["Improving Generalization Performance of Adaptive Learning Rate by Switching from Block Diagonal Matrix Preconditioning to SGD"],"prefix":"10.1109","author":[{"given":"Yasutoshi","family":"Ida","sequence":"first","affiliation":[]},{"given":"Yasuhiro","family":"Fujiwara","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"article-title":"Very Deep Convolutional Networks for Large-scale Image Recognition","year":"2014","author":"simonyan","key":"ref38"},{"key":"ref33","article-title":"Improving Generalization Performance by Switching from Adam to SGD","volume":"abs 1712 7628","author":"keskar","year":"2017","journal-title":"CoRR"},{"key":"ref32","article-title":"Google&#x2019;s Neural Machine Translation System: Bridging the Gap between Human and Machine Translation","volume":"abs 1609 8144","author":"wu","year":"2016","journal-title":"CoRR"},{"key":"ref31","first-page":"413","article-title":"Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients","author":"balles","year":"2018","journal-title":"ICML"},{"key":"ref30","first-page":"5285","article-title":"Second-order Optimization for Deep Reinforcement Learning using Kronecker-factored Approximation","author":"wu","year":"2017","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"article-title":"Learning Multiple Layers of Features from Tiny Images","year":"2009","author":"krizhevsky","key":"ref37"},{"key":"ref36","article-title":"Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms","volume":"abs 1708 7747","author":"xiao","year":"2017","journal-title":"CoRR"},{"key":"ref35","first-page":"3288","article-title":"Convolutional Neural Networks Applied to House Numbers Digit Classification","author":"sermanet","year":"2012","journal-title":"International Conference on Pattern Recognition (ICPR)"},{"key":"ref34","article-title":"Extremely Large Minibatch SGD: Training ResNet-50 on ImageNet in 15 Minutes","volume":"abs 1711 4325","author":"akiba","year":"2017","journal-title":"CoRR"},{"article-title":"ADADELTA: An Adaptive Learning Rate Method","year":"2012","author":"zeiler","key":"ref10"},{"key":"ref11","article-title":"Adam: A Method for Stochastic Optimization","author":"kingma","year":"2014","journal-title":"International Conference on Learning Representations"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/267"},{"key":"ref13","first-page":"1504","article-title":"Equilibrated Adaptive Learning Rates for Non-convex Optimization","author":"dauphin","year":"2015","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref14","first-page":"2933","article-title":"Identifying and Attacking the Saddle Point Problem in High-dimensional Non-convex Optimization","author":"dauphin","year":"2014","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref15","first-page":"7654","article-title":"The Anisotropic Noise in Stochastic Gradient Descent: Its Behavior of Escaping from Sharp Minima and Regularization Effects","author":"zhu","year":"2019","journal-title":"ICML"},{"key":"ref16","first-page":"1561","article-title":"Fast Lasso Algorithm via Selective Coordinate Descent","author":"fujiwara","year":"2016","journal-title":"AAAI"},{"key":"ref17","first-page":"1700","article-title":"Fast Sparse Group Lasso","author":"ida","year":"2019","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1090\/mmono\/191"},{"key":"ref19","first-page":"2408","article-title":"Optimizing Neural Networks with Kronecker-factored Approximate Curvature","author":"martens","year":"2015","journal-title":"ICML"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1162\/089976698300017746"},{"key":"ref4","first-page":"649","article-title":"Character-level Convolutional Networks for Text Classification","author":"zhang","year":"2015","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref27","first-page":"4151","article-title":"The Marginal Value of Adaptive Gradient Methods in Machine Learning","author":"wilson","year":"2017","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref3","first-page":"379","article-title":"R-FCN: Object Detection via Region-based Fully Convolutional Networks","author":"dai","year":"2016","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177729586"},{"key":"ref29","first-page":"573","article-title":"A Kronecker-factored Approximate Fisher Matrix for Convolution Layers","author":"grosse","year":"2016","journal-title":"ICML"},{"key":"ref5","first-page":"2966","article-title":"Deep Voice 2: Multi-Speaker Neural Text-to-Speech","author":"gibiansky","year":"2017","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref8","first-page":"2121","article-title":"Adaptive Subgradient Methods for Online Learning and Stochastic Optimization","volume":"12","author":"duchi","year":"2011","journal-title":"The Journal of Machine Learning Research"},{"key":"ref7","article-title":"On the Convergence of Adam and Beyond","author":"reddi","year":"2018","journal-title":"International Conference on Learning Representations (ICLR)"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2019.8852442"},{"key":"ref9","article-title":"Lecture 6.5-rmsprop: Divide the Gradient by a Running Average of its Recent Magnitude","author":"tieleman","year":"2012","journal-title":"COURSERA Neural Networks for Machine Learning"},{"key":"ref1","first-page":"1106","article-title":"ImageNet Classification with Deep Convolutional Neural Networks","author":"krizhevsky","year":"2012","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref20","first-page":"134:1","article-title":"Stochastic Gradient Descent as Approximate Bayesian Inference","volume":"18","author":"mandt","year":"2017","journal-title":"Journal of Machine Learning Research"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/72.377972"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/BF00332914"},{"key":"ref24","first-page":"156","article-title":"Automatic Learning Rate Maximization by On-Line Estimation of the Hessian&#x2019;s Eigenvectors","author":"lecun","year":"1993","journal-title":"The Annual Conference on Neural Information Processing Systems"},{"key":"ref23","article-title":"Accelerated Backpropagation Learning: Two Optimization Methods","volume":"3","author":"battiti","year":"1989","journal-title":"Complex Systems"},{"key":"ref26","article-title":"On-Line Step Size Adaptation","author":"almeida","year":"1997","journal-title":"Technical Report"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/0893-6080(88)90003-2"}],"event":{"name":"2020 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2020,7,19]]},"location":"Glasgow, United Kingdom","end":{"date-parts":[[2020,7,24]]}},"container-title":["2020 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9200848\/9206590\/09207425.pdf?arnumber=9207425","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,28]],"date-time":"2022-06-28T21:53:46Z","timestamp":1656453226000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9207425\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,7]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/ijcnn48605.2020.9207425","relation":{},"subject":[],"published":{"date-parts":[[2020,7]]}}}