{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T08:23:25Z","timestamp":1760171005552,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319461274"},{"type":"electronic","value":"9783319461281"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46128-1_1","type":"book-chapter","created":{"date-parts":[[2016,9,3]],"date-time":"2016-09-03T05:34:23Z","timestamp":1472880863000},"page":"1-16","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["adaQN: An Adaptive Quasi-Newton Algorithm for Training RNNs"],"prefix":"10.1007","author":[{"given":"Nitish Shirish","family":"Keskar","sequence":"first","affiliation":[]},{"given":"Albert S.","family":"Berahas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,4]]},"reference":[{"issue":"2","key":"1_CR1","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1109\/72.279181","volume":"5","author":"Y Bengio","year":"1994","unstructured":"Bengio, Y., Simard, P., Frasconi, P.: Learning long-term dependencies with gradient descent is difficult. Neural Netw. IEEE Trans. 5(2), 157\u2013166 (1994)","journal-title":"Neural Netw. IEEE Trans."},{"key":"1_CR2","unstructured":"Bengio, Y., Goodfellow, I., Courville, A.: Deep learning (2016). Book in preparation for MIT Press. http:\/\/www.deeplearningbook.org"},{"issue":"2","key":"1_CR3","doi-asserted-by":"publisher","first-page":"1008","DOI":"10.1137\/140954362","volume":"26","author":"RH Byrd","year":"2016","unstructured":"Byrd, R.H., Hansen, S.L., Nocedal, J., Singer, Y.: A stochastic quasi-Newton method for large-scale optimization. SIAM J. Optim. 26(2), 1008\u20131031 (2016)","journal-title":"SIAM J. Optim."},{"key":"1_CR4","doi-asserted-by":"crossref","unstructured":"Cho, K., Van Merri\u00ebnboer, B., G\u00fcl\u00e7ehre, \u00c7., Bahdanau, D., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1724\u20131734. Association for Computational Linguistics, Doha, Qatar, October 2014","DOI":"10.3115\/v1\/D14-1179"},{"key":"1_CR5","first-page":"2121","volume":"12","author":"J Duchi","year":"2011","unstructured":"Duchi, J., Hazan, E., Singer, Y.: Adaptive subgradient methods for online learning and stochastic optimization. J. Mach. Learn. Res. 12, 2121\u20132159 (2011)","journal-title":"J. Mach. Learn. Res."},{"key":"1_CR6","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-642-24797-2","volume-title":"Supervised sequence labelling with Recurrent Neural Networks","author":"A Graves","year":"2012","unstructured":"Graves, A.: Supervised sequence labelling with Recurrent Neural Networks, vol. 385. Springer, Heidelberg (2012)"},{"key":"1_CR7","doi-asserted-by":"crossref","unstructured":"Graves, A., Mohamed, A., Hinton, G.: Speech recognition with deep recurrent neural networks. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013), pp. 6645\u20136649 (2013)","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"1_CR8","unstructured":"Graves, A., Schmidhuber, J.: Offline handwriting recognition with multidimensional recurrent neural networks. In: Advances in Neural Information Processing Systems (NIPS 2009), pp. 545\u2013552 (2009)"},{"issue":"8","key":"1_CR9","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"1_CR10","unstructured":"Karpathy, A., Johnson, J., Li, F.: Visualizing and understanding recurrent networks. In: International Conference on Learning Representations (ICLR 2016) (2016)"},{"key":"1_CR11","unstructured":"Keskar, N., Berahas, A.S.: minSQN: Stochastic Quasi-Newton Optimization in MATLAB (2016). https:\/\/github.com\/keskarnitish\/minSQN\/"},{"key":"1_CR12","unstructured":"Kingma, D., Ba, J.: Adam: A method for stochastic optimization. In: International Conference on Learning Representations (ICLR 2015) (2015)"},{"key":"1_CR13","unstructured":"Le, Q.V., Jaitly, N., Hinton, G.E.: A simple way to initialize recurrent networks of rectified linear units. arXiv preprint (2015). arXiv:1504.00941"},{"key":"1_CR14","unstructured":"Martens, J.: Deep learning via hessian-free optimization. In: Proceedings of the 27th International Conference on Machine Learning (ICML 2010) (2010)"},{"key":"1_CR15","unstructured":"Martens, J.: New perspectives on the natural gradient method. arXiv preprint (2014). arXiv:1412.1193"},{"key":"1_CR16","unstructured":"Martens, J., Grosse, R.: Optimizing neural networks with kronecker-factored approximate curvature. In: Proceedings of the 32th International Conference on Machine Learning (ICML 2015) (2015)"},{"key":"1_CR17","unstructured":"Martens, J., Sutskever, I.: Learning recurrent neural networks with hessian-free optimization. In: Proceedings of the 28th International Conference on Machine Learning (ICML 2011), pp. 1033\u20131040 (2011)"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"Mikolov, T., Kombrink, S., Deoras, A., Burget, L., Cernocky, J.: Rnnlm-recurrent neural network language modeling toolkit. In: Proceedings of the 2011 ASRU Workshop, pp. 196\u2013201 (2011)","DOI":"10.21437\/Interspeech.2011-720"},{"issue":"23","key":"1_CR19","doi-asserted-by":"publisher","first-page":"6089","DOI":"10.1109\/TSP.2014.2357775","volume":"62","author":"A Mokhtari","year":"2014","unstructured":"Mokhtari, A., Ribeiro, A.: RES: Regularized stochastic BFGS algorithm. Sig. Proces. IEEE Trans. 62(23), 6089\u20136104 (2014)","journal-title":"Sig. Proces. IEEE Trans."},{"key":"1_CR20","first-page":"3151","volume":"16","author":"A Mokhtari","year":"2015","unstructured":"Mokhtari, A., Ribeiro, A.: Global convergence of online limited memory BFGS. J. Mach. Learn. Res. 16, 3151\u20133181 (2015)","journal-title":"J. Mach. Learn. Res."},{"key":"1_CR21","first-page":"372","volume":"27","author":"Y Nesterov","year":"1983","unstructured":"Nesterov, Y.: A method of solving a convex programming problem with convergence rate o (1\/k2). Sov. Math. Dokl. 27, 372\u2013376 (1983)","journal-title":"Sov. Math. Dokl."},{"key":"1_CR22","volume-title":"Numerical optimization","author":"J Nocedal","year":"2006","unstructured":"Nocedal, J., Wright, S.: Numerical optimization. Springer, Heidelberg (2006)"},{"key":"1_CR23","unstructured":"Pascanu, R., Bengio, Y.: Revisiting natural gradient for deep networks. In: International Conference on Learning Representations (ICLR 2013) (2013)"},{"key":"1_CR24","unstructured":"Pascanu, R., Mikolov, T., Bengio, Y.: On the difficulty of training recurrent neural networks. In: Proceedings of the 31st International Conference on Machine Learning (ICML 2014), pp. 1310\u20131318 (2013)"},{"key":"1_CR25","series-title":"Automatic speech and speaker recognition","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4613-1367-0_10","volume-title":"The use of recurrent neural networks in continuous speech recognition","author":"T Robinson","year":"1996","unstructured":"Robinson, T., Hochberg, M., Renals, S.: The use of recurrent neural networks in continuous speech recognition. Automatic speech and speaker recognition. Springer, Heidelberg (1996)"},{"key":"1_CR26","unstructured":"Schraudolph, N.N., Yu, J., G\u00fcnter, S.: A stochastic quasi-newton method for online convex optimization. In: International Conference on Artificial Intelligence and Statistics, pp. 436\u2013443 (2007)"},{"key":"1_CR27","unstructured":"Sutskever, I., Martens, J., Dahl, G., Hinton, G.: On the importance of initialization and momentum in deep learning. In: Proceedings of the 30th International Conference on Machine Learning (ICML 2013), pp. 1139\u20131147 (2013)"},{"key":"1_CR28","first-page":"26","volume":"4","author":"T Tieleman","year":"2012","unstructured":"Tieleman, T., Hinton, G.: Lecture 6.5-RMSProp: Divide the gradient by a running average of its recent magnitude. COURSERA: Neural Netw. Mach. Learn. 4, 26\u201331 (2012)","journal-title":"COURSERA: Neural Netw. Mach. Learn."},{"key":"1_CR29","unstructured":"Wang, X., Ma, S., Liu, W.: Stochastic quasi-Newton methods for nonconvex stochastic optimization. arXiv preprint (2014). arXiv:1412.1196"},{"key":"1_CR30","unstructured":"Zaremba, W., Sutskever, I., Vinyals, O.: Recurrent Neural Network Regularization. arXiv preprint (2014). arXiv:1409.2329"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46128-1_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,9,5]],"date-time":"2021-09-05T00:09:12Z","timestamp":1630800552000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-46128-1_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319461274","9783319461281"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46128-1_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"4 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Riva del Garda","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 September 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}