{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T15:00:41Z","timestamp":1761663641316,"version":"3.28.0"},"reference-count":27,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017,5]]},"DOI":"10.1109\/ijcnn.2017.7965845","type":"proceedings-article","created":{"date-parts":[[2017,7,10]],"date-time":"2017-07-10T17:41:30Z","timestamp":1499708490000},"page":"125-132","source":"Crossref","is-referenced-by-count":14,"title":["A robust adaptive stochastic gradient method for deep learning"],"prefix":"10.1109","author":[{"given":"Caglar","family":"Gulcehre","sequence":"first","affiliation":[]},{"given":"Jose","family":"Sotelo","sequence":"additional","affiliation":[]},{"given":"Marcin","family":"Moczulski","sequence":"additional","affiliation":[]},{"given":"Yoshua","family":"Bengio","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","first-page":"181","article-title":"Variance reduction for stochastic gradient optimization","author":"wang","year":"2013","journal-title":"Advances in neural information processing systems"},{"key":"ref11","first-page":"315","article-title":"Accelerating stochastic gradient descent using predictive variance reduction","author":"johnson","year":"2013","journal-title":"Advances in neural information processing systems"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/72.279181"},{"journal-title":"Gradient Flow in Recurrent Nets The Difficulty of Learning Long-term Dependencies","year":"2001","author":"hochreiter","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177729586"},{"key":"ref15","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"duchi","year":"2011","journal-title":"The Journal of Machine Learning Research"},{"journal-title":"Maxout Networks","year":"2013","author":"goodfellow","key":"ref16"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2005.132"},{"journal-title":"Generating Sequences with Recurrent Neural Networks","year":"2013","author":"graves","key":"ref18"},{"journal-title":"ADADELTA An Adaptive Learning Rate Method","year":"2012","author":"zeiler","key":"ref19"},{"key":"ref4","first-page":"156","article-title":"Automatic learning rate maximization by on-line estimation of the hessians eigenvectors","volume":"5","author":"lecun","year":"1993","journal-title":"Advances in neural information processing systems"},{"journal-title":"Theano A Python framework for fast computation of mathematical expressions","year":"2016","key":"ref27"},{"journal-title":"No More Pesky Learning Rates","year":"2012","author":"schaul","key":"ref3"},{"journal-title":"Adaptive learning rates and parallelization for stochastic sparse non-smooth gradients","year":"2013","author":"schaul","key":"ref6"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1007\/978-3-642-35289-8_3","article-title":"Efficient backprop","author":"lecun","year":"2012","journal-title":"Neural Networks Tricks of the Trade"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.cam.2004.05.013"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1090\/S0025-5718-01-01332-1"},{"key":"ref2","first-page":"29","article-title":"Improving the convergence of backpropagation learning with second order methods","author":"becker","year":"1988","journal-title":"Proceedings of the 1988 Connectionist Models Summer School"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1162\/08997660260028683"},{"journal-title":"Adasecant robust adaptive secant method for stochastic gradient","year":"2014","author":"gulcehre","key":"ref1"},{"key":"ref20","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"International Conference on Learning Representations"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"journal-title":"Learning phrase representations using RNN encoder-decoder for statistical machine translation","year":"2014","author":"cho","key":"ref21"},{"key":"ref24","article-title":"Theano: new features and speed improvements","author":"bastien","year":"2012","journal-title":"NIPS 2012 Workshop on Deep Learning and Unsupervised Feature Learning"},{"journal-title":"Subword language modeling with neural networks","year":"2012","author":"mikolov","key":"ref23"},{"journal-title":"Blocks and fuel Frameworks for deep learning","year":"2015","author":"van merri\u00ebnboer","key":"ref26"},{"journal-title":"Pylearn2 a machine learning researchlibrary","year":"2013","author":"goodfellow","key":"ref25"}],"event":{"name":"2017 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2017,5,14]]},"location":"Anchorage, AK, USA","end":{"date-parts":[[2017,5,19]]}},"container-title":["2017 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7958416\/7965814\/07965845.pdf?arnumber=7965845","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,29]],"date-time":"2019-09-29T06:16:33Z","timestamp":1569737793000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7965845\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,5]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/ijcnn.2017.7965845","relation":{},"subject":[],"published":{"date-parts":[[2017,5]]}}}