{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:14:28Z","timestamp":1777655668759,"version":"3.51.4"},"publisher-location":"Berlin, Heidelberg","reference-count":34,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642352881","type":"print"},{"value":"9783642352898","type":"electronic"}],"license":[{"start":{"date-parts":[[2012,1,1]],"date-time":"2012-01-01T00:00:00Z","timestamp":1325376000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-35289-8_27","type":"book-chapter","created":{"date-parts":[[2012,11,14]],"date-time":"2012-11-14T12:03:17Z","timestamp":1352894597000},"page":"479-535","source":"Crossref","is-referenced-by-count":114,"title":["Training Deep and Recurrent Networks with Hessian-Free Optimization"],"prefix":"10.1007","author":[{"given":"James","family":"Martens","sequence":"first","affiliation":[]},{"given":"Ilya","family":"Sutskever","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"issue":"2","key":"27_CR1","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1162\/089976698300017746","volume":"10","author":"S.I. Amari","year":"1998","unstructured":"Amari, S.I.: Natural gradient works efficiently in learning. Neural Computation\u00a010(2), 251\u2013276 (1998)","journal-title":"Neural Computation"},{"key":"27_CR2","first-page":"29","volume-title":"Proceedings of the 1988 Connectionist Models Summer School","author":"S. Becker","year":"1988","unstructured":"Becker, S., Le Cun, Y.: Improving the convergence of back-propagation learning with second order methods. In: Proceedings of the 1988 Connectionist Models Summer School, pp. 29\u201337. Morgan Kaufmann, San Matteo (1988)"},{"key":"27_CR3","doi-asserted-by":"crossref","unstructured":"Bengio, Y., Lamblin, P., Popovici, D., Larochelle, H.: Greedy layer-wise training of deep networks. In: Advances in Neural Information Processing Systems, vol.\u00a019, p. 153 (2007)","DOI":"10.7551\/mitpress\/7503.003.0024"},{"issue":"2","key":"27_CR4","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1109\/72.279181","volume":"5","author":"Y. Bengio","year":"1994","unstructured":"Bengio, Y., Simard, P., Frasconi, P.: Learning long-term dependencies with gradient descent is difficult. IEEE Transactions on Neural Networks\u00a05(2), 157\u2013166 (1994)","journal-title":"IEEE Transactions on Neural Networks"},{"key":"27_CR5","doi-asserted-by":"crossref","unstructured":"Bergstra, J., Breuleux, O., Bastien, F., Lamblin, P., Pascanu, R., Desjardins, G., Turian, J., Warde-Farley, D., Bengio, Y.: Theano: a cpu and gpu math expression compiler. In: Proceedings of the Python for Scientific Computing Conference (SciPy), vol.\u00a04 (2010)","DOI":"10.25080\/Majora-92bf1922-003"},{"issue":"4","key":"27_CR6","doi-asserted-by":"publisher","first-page":"494","DOI":"10.1162\/neco.1992.4.4.494","volume":"4","author":"C. Bishop","year":"1992","unstructured":"Bishop, C.: Exact calculation of the hessian matrix for the multilayer perceptron. Neural Computation\u00a04(4), 494\u2013501 (1992)","journal-title":"Neural Computation"},{"key":"27_CR7","unstructured":"Bottou, L., Bousquet, O.: The tradeoffs of large scale learning. In: Advances in Neural Information Processing Systems, vol.\u00a020, pp. 161\u2013168 (2008)"},{"key":"27_CR8","doi-asserted-by":"publisher","first-page":"977","DOI":"10.1137\/10079923X","volume":"21","author":"R.H. Byrd","year":"2011","unstructured":"Byrd, R.H., Chin, G.M., Neveitt, W., Nocedal, J.: On the use of stochastic hessian information in optimization methods for machine learning. SIAM Journal on Optimization\u00a021, 977 (2011)","journal-title":"SIAM Journal on Optimization"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Byrd, R.H., Chin, G.M., Nocedal, J., Wu, Y.: Sample size selection in optimization methods for machine learning. Mathematical Programming, 1\u201329 (2012)","DOI":"10.1007\/s10107-012-0572-5"},{"key":"27_CR10","unstructured":"Chapelle, O., Erhan, D.: Improved preconditioner for hessian free optimization. In: NIPS Workshop on Deep Learning and Unsupervised Feature Learning (2011)"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Schraudolph, N.: Fast Curvature Matrix-Vector Products for Second-Order Gradient Descent. Neural Computation\u00a014 (2002)","DOI":"10.1162\/08997660260028683"},{"key":"27_CR12","unstructured":"Glorot, X., Bengio, Y.: Understanding the difficulty of training deep feedforward neural networks. In: Proceedings of the International Conference on Artificial Intelligence and Statistics (AISTATS 2010). Society for Artificial Intelligence and Statistics (2010)"},{"issue":"2","key":"27_CR13","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1137\/S1052623497322735","volume":"9","author":"N.I.M. Gould","year":"1999","unstructured":"Gould, N.I.M., Lucidi, S., Roma, M., Toint, P.L.: Solving the trust-region subproblem using the lanczos method. SIAM Journal on Optimization\u00a09(2), 504\u2013525 (1999)","journal-title":"SIAM Journal on Optimization"},{"key":"27_CR14","doi-asserted-by":"publisher","first-page":"1487","DOI":"10.1137\/0914086","volume":"14","author":"P.C. Hansen","year":"1993","unstructured":"Hansen, P.C., O\u2019Leary, D.P.: The use of the l-curve in the regularization of discrete ill-posed problems. SIAM Journal on Scientific Computing\u00a014, 1487 (1993)","journal-title":"SIAM Journal on Scientific Computing"},{"key":"27_CR15","doi-asserted-by":"crossref","unstructured":"Hestenes, M.R., Stiefel, E.: Methods of conjugate gradients for solving linear systems (1952)","DOI":"10.6028\/jres.049.044"},{"issue":"7","key":"27_CR16","doi-asserted-by":"publisher","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"G.E. Hinton","year":"2006","unstructured":"Hinton, G.E., Osindero, S., Teh, Y.W.: A fast learning algorithm for deep belief nets. Neural Computation\u00a018(7), 1527\u20131554 (2006)","journal-title":"Neural Computation"},{"issue":"5786","key":"27_CR17","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1126\/science.1127647","volume":"313","author":"G.E. Hinton","year":"2006","unstructured":"Hinton, G.E., Salakhutdinov, R.R.: Reducing the dimensionality of data with neural networks. Science\u00a0313(5786), 504\u2013507 (2006)","journal-title":"Science"},{"key":"27_CR18","unstructured":"Hochreiter, S.: Untersuchungen zu dynamischen neuronalen netzen. diploma thesis, Institut f\u00fcr Informatik, Lehrstuhl Prof. Brauer, Technische Universit\u00e4t M\u00fcnchen (1991)"},{"issue":"8","key":"27_CR19","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S. Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Computation\u00a09(8), 1735\u20131780 (1997)","journal-title":"Neural Computation"},{"issue":"5667","key":"27_CR20","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1126\/science.1091277","volume":"304","author":"H. Jaeger","year":"2004","unstructured":"Jaeger, H., Haas, H.: Harnessing nonlinearity: Predicting chaotic systems and saving energy in wireless communication. Science\u00a0304(5667), 78\u201380 (2004)","journal-title":"Science"},{"key":"27_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1007\/3-540-49430-8_2","volume-title":"Neural Networks: Tricks of the Trade","author":"Y. LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Orr, G.B., M\u00fcller, K.-R.: Efficient BackProp. In: Orr, G.B., M\u00fcller, K.-R. (eds.) NIPS-WS 1996. LNCS, vol.\u00a01524, pp. 9\u201350. Springer, Heidelberg (1998)"},{"key":"27_CR22","unstructured":"Martens, J.: Deep learning via hessian-free optimization. In: Proceedings of the 27th International Conference on Machine Learning (ICML), vol.\u00a0951 (2010)"},{"key":"27_CR23","doi-asserted-by":"crossref","unstructured":"Martens, J., Sutskever, I.: Learning recurrent neural networks with hessian-free optimization. In: Proc. ICML (2011)","DOI":"10.1007\/978-3-642-35289-8_27"},{"key":"27_CR24","unstructured":"Martens, J., Sutskever, I., Swersky, K.: Estimating the hessian by backpropagating curvature. In: Proc. ICML (2012)"},{"key":"27_CR25","doi-asserted-by":"crossref","unstructured":"Mor\u00e9, J.J.: The levenberg-marquardt algorithm: implementation and theory. Numerical Analysis, 105\u2013116 (1978)","DOI":"10.1007\/BFb0067700"},{"key":"27_CR26","doi-asserted-by":"publisher","first-page":"553","DOI":"10.1137\/0904038","volume":"4","author":"J.J. Mor\u00e9","year":"1983","unstructured":"Mor\u00e9, J.J., Sorensen, D.C.: Computing a trust region step. SIAM Journal on Scientific and Statistical Computing\u00a04, 553 (1983)","journal-title":"SIAM Journal on Scientific and Statistical Computing"},{"key":"27_CR27","doi-asserted-by":"crossref","unstructured":"Nash, S.G.: Newton-type minimization via the lanczos method. SIAM Journal on Numerical Analysis, 770\u2013788 (1984)","DOI":"10.1137\/0721052"},{"issue":"1","key":"27_CR28","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1016\/S0377-0427(00)00426-X","volume":"124","author":"S.G. Nash","year":"2000","unstructured":"Nash, S.G.: A survey of truncated-newton methods. Journal of Computational and Applied Mathematics\u00a0124(1), 45\u201359 (2000)","journal-title":"Journal of Computational and Applied Mathematics"},{"key":"27_CR29","unstructured":"Nesterov, Y.: A method for unconstrained convex minimization problem with the rate of convergence o (1\/k2). In: Doklady AN SSSR, vol.\u00a0269, pp. 543\u2013547 (1983)"},{"key":"27_CR30","doi-asserted-by":"crossref","unstructured":"Nocedal, J., Wright, S.J.: Numerical optimization. Springer (1999)","DOI":"10.1007\/b98874"},{"issue":"1","key":"27_CR31","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1162\/neco.1994.6.1.147","volume":"6","author":"B.A. Pearlmutter","year":"1994","unstructured":"Pearlmutter, B.A.: Fast exact multiplication by the hessian. Neural Computation\u00a06(1), 147\u2013160 (1994)","journal-title":"Neural Computation"},{"key":"27_CR32","unstructured":"Shewchuk, J.R.: An introduction to the conjugate gradient method without the agonizing pain (1994)"},{"key":"27_CR33","unstructured":"Vinyals, O., Povey, D.: Krylov subspace descent for deep learning. arXiv preprint arXiv:1111.4259 (2011)"},{"issue":"8","key":"27_CR34","doi-asserted-by":"publisher","first-page":"463","DOI":"10.1145\/355586.364791","volume":"7","author":"R.E. Wengert","year":"1964","unstructured":"Wengert, R.E.: A simple automatic derivative evaluation program. Communications of the ACM\u00a07(8), 463\u2013464 (1964)","journal-title":"Communications of the ACM"}],"container-title":["Lecture Notes in Computer Science","Neural Networks: Tricks of the Trade"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-35289-8_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T19:53:31Z","timestamp":1687809211000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-35289-8_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642352881","9783642352898"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-35289-8_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012]]}}}