{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T15:28:16Z","timestamp":1779290896489,"version":"3.51.4"},"publisher-location":"Berlin, Heidelberg","reference-count":120,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642352881","type":"print"},{"value":"9783642352898","type":"electronic"}],"license":[{"start":{"date-parts":[[2012,1,1]],"date-time":"2012-01-01T00:00:00Z","timestamp":1325376000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-35289-8_26","type":"book-chapter","created":{"date-parts":[[2012,11,14]],"date-time":"2012-11-14T12:03:17Z","timestamp":1352894597000},"page":"437-478","source":"Crossref","is-referenced-by-count":1232,"title":["Practical Recommendations for Gradient-Based Training of Deep Architectures"],"prefix":"10.1007","author":[{"given":"Yoshua","family":"Bengio","sequence":"first","affiliation":[]}],"member":"297","reference":[{"issue":"2","key":"26_CR1","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1162\/089976698300017746","volume":"10","author":"S. Amari","year":"1998","unstructured":"Amari, S.: Natural gradient works efficiently in learning. Neural Computation\u00a010(2), 251\u2013276 (1998)","journal-title":"Neural Computation"},{"key":"26_CR2","unstructured":"Bach, F., Moulines, E.: Non-asymptotic analysis of stochastic approximation algorithms. In: NIPS 2011 (2011)"},{"key":"26_CR3","unstructured":"Bagnell, J.A., Bradley, D.M.: Differentiable sparse coding. In: NIPS 2009, pp. 113\u2013120 (2009)"},{"key":"26_CR4","doi-asserted-by":"crossref","unstructured":"Baxter, J.: Learning internal representations. In: COLT 1995, pp. 311\u2013320 (1995)","DOI":"10.1145\/225298.225336"},{"key":"26_CR5","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1023\/A:1007327622663","volume":"28","author":"J. Baxter","year":"1997","unstructured":"Baxter, J.: A Bayesian\/information theoretic model of learning via multiple task sampling. Machine Learning\u00a028, 7\u201340 (1997)","journal-title":"Machine Learning"},{"issue":"1","key":"26_CR6","doi-asserted-by":"publisher","first-page":"3881","DOI":"10.4249\/scholarpedia.3881","volume":"3","author":"Y. Bengio","year":"2008","unstructured":"Bengio, Y.: Neural net language models. Scholarpedia\u00a03(1), 3881 (2008)","journal-title":"Scholarpedia"},{"key":"26_CR7","doi-asserted-by":"crossref","unstructured":"Bengio, Y.: Learning deep architectures for AI. Now Publishers (2009)","DOI":"10.1561\/2200000006"},{"key":"26_CR8","unstructured":"Bengio, Y.: Deep learning of representations for unsupervised and transfer learning. In: JMLR W&CP: Proc. Unsupervised and Transfer Learning (2011)"},{"key":"26_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1007\/978-3-642-24412-4_3","volume-title":"Algorithmic Learning Theory","author":"Y. Bengio","year":"2011","unstructured":"Bengio, Y., Delalleau, O.: On the Expressive Power of Deep Architectures. In: Kivinen, J., Szepesv\u00e1ri, C., Ukkonen, E., Zeugmann, T. (eds.) ALT 2011. LNCS, vol.\u00a06925, pp. 18\u201336. Springer, Heidelberg (2011)"},{"key":"26_CR10","doi-asserted-by":"crossref","unstructured":"Bengio, Y., LeCun, Y.: Scaling learning algorithms towards AI. In: Large Scale Kernel Machines (2007)","DOI":"10.7551\/mitpress\/7496.003.0016"},{"key":"26_CR11","first-page":"1137","volume":"3","author":"Y. Bengio","year":"2003","unstructured":"Bengio, Y., Ducharme, R., Vincent, P., Jauvin, C.: A neural probabilistic language model. JMLR\u00a03, 1137\u20131155 (2003)","journal-title":"JMLR"},{"key":"26_CR12","unstructured":"Bengio, Y., Le Roux, N., Vincent, P., Delalleau, O., Marcotte, P.: Convex neural networks. In: NIPS 2005, pp. 123\u2013130 (2006a)"},{"key":"26_CR13","unstructured":"Bengio, Y., Delalleau, O., Le Roux, N.: The curse of highly variable functions for local kernel machines. In: NIPS 2005, pp. 107\u2013114 (2006b)"},{"key":"26_CR14","doi-asserted-by":"crossref","unstructured":"Bengio, Y., Lamblin, P., Popovici, D., Larochelle, H.: Greedy layer-wise training of deep networks. In: NIPS 2006 (2007)","DOI":"10.7551\/mitpress\/7503.003.0024"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Bengio, Y., Louradour, J., Collobert, R., Weston, J.: Curriculum learning. In: ICML 2009 (2009)","DOI":"10.1145\/1553374.1553380"},{"key":"26_CR16","unstructured":"Bengio, Y., Alain, G., Rifai, S.: Implicit density estimation by local moment matching to sample from auto-encoders. Technical report, arXiv:1207.0057 (2012)"},{"key":"26_CR17","first-page":"281","volume":"13","author":"J. Bergstra","year":"2012","unstructured":"Bergstra, J., Bengio, Y.: Random search for hyper-parameter optimization. J. Machine Learning Res.\u00a013, 281\u2013305 (2012)","journal-title":"J. Machine Learning Res."},{"key":"26_CR18","doi-asserted-by":"crossref","unstructured":"Bergstra, J., Breuleux, O., Bastien, F., Lamblin, P., Pascanu, R., Desjardins, G., Turian, J., Warde-Farley, D., Bengio, Y.: Theano: a CPU and GPU math expression compiler. In: Proc. Python for Scientific Comp. Conf. (SciPy) (2010)","DOI":"10.25080\/Majora-92bf1922-003"},{"key":"26_CR19","unstructured":"Bergstra, J., Bardenet, R., Bengio, Y., K\u00e9gl, B.: Algorithms for hyper-parameter optimization. In: NIPS 2011 (2011)"},{"key":"26_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1007\/3-540-46084-5_14","volume-title":"Artificial Neural Networks - ICANN 2002","author":"P. Berkes","year":"2002","unstructured":"Berkes, P., Wiskott, L.: Applying Slow Feature Analysis to Image Sequences Yields a Rich Repertoire of Complex Cell Properties. In: Dorronsoro, J.R. (ed.) ICANN 2002. LNCS, vol.\u00a02415, pp. 81\u201386. Springer, Heidelberg (2002)"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"Bertsekas, D.P.: Incremental gradient, subgradient, and proximal methods for convex optimization: a survey. Technical Report 2848, LIDS (2010)","DOI":"10.7551\/mitpress\/8996.003.0006"},{"key":"26_CR22","first-page":"1737","volume":"10","author":"A. Bordes","year":"2009","unstructured":"Bordes, A., Bottou, L., Gallinari, P.: Sgd-qn: Careful quasi-newton stochastic gradient descent. Journal of Machine Learning Research\u00a010, 1737\u20131754 (2009)","journal-title":"Journal of Machine Learning Research"},{"key":"26_CR23","doi-asserted-by":"crossref","unstructured":"Bordes, A., Weston, J., Collobert, R., Bengio, Y. (2011). Learning structured embeddings of knowledge bases. In: AAAI (2011)","DOI":"10.1609\/aaai.v25i1.7917"},{"key":"26_CR24","unstructured":"Bordes, A., Glorot, X., Weston, J., Bengio, Y.: Joint learning of words and meaning representations for open-text semantic parsing. In: AISTATS 2012 (2012)"},{"key":"26_CR25","unstructured":"Bottou, L.: From machine learning to machine reasoning. Technical report, arXiv.1102 (2011)"},{"key":"26_CR26","series-title":"LNCS","first-page":"421","volume-title":"NN: Tricks of the Trade","author":"L. Bottou","year":"2012","unstructured":"Bottou, L.: Stochastic Gradient Descent Tricks. In: Montavon, G., Orr, G.B., M\u00fcller, K.-R. (eds.) NN: Tricks of the Trade, 2nd edn. LNCS, vol.\u00a07700, pp. 421\u2013436. Springer, Heidelberg (2012)","edition":"2"},{"key":"26_CR27","unstructured":"Bottou, L., Bousquet, O.: The tradeoffs of large scale learning. In: NIPS 2008 (2008)"},{"key":"26_CR28","unstructured":"Bottou, L., LeCun, Y.: Large-scale on-line learning. In: NIPS 2003 (2004)"},{"issue":"2","key":"26_CR29","first-page":"123","volume":"24","author":"L. Breiman","year":"1994","unstructured":"Breiman, L.: Bagging predictors. Machine Learning\u00a024(2), 123\u2013140 (1994)","journal-title":"Machine Learning"},{"issue":"8","key":"26_CR30","doi-asserted-by":"publisher","first-page":"2053","DOI":"10.1162\/NECO_a_00158","volume":"23","author":"O. Breuleux","year":"2011","unstructured":"Breuleux, O., Bengio, Y., Vincent, P.: Quickly generating representative samples from an rbm-derived process. Neural Computation\u00a023(8), 2053\u20132073 (2011)","journal-title":"Neural Computation"},{"key":"26_CR31","unstructured":"Caruana, R.: Multitask connectionist learning. In: Proceedings of the 1993 Connectionist Models Summer School, pp. 372\u2013379 (1993)"},{"key":"26_CR32","doi-asserted-by":"crossref","unstructured":"Cho, K., Raiko, T., Ilin, A.: Enhanced gradient and adaptive learning rate for training restricted boltzmann machines. In: ICML 2011, pp. 105\u2013112 (2011)","DOI":"10.1007\/978-3-642-21735-7_2"},{"key":"26_CR33","unstructured":"Coates, A., Ng, A.Y.: The importance of encoding versus training with sparse coding and vector quantization. In: ICML 2011 (2011)"},{"key":"26_CR34","doi-asserted-by":"crossref","unstructured":"Collobert, R., Bengio, S.: Links between perceptrons, MLPs and SVMs. In: ICML 2004 (2004a)","DOI":"10.1145\/1015330.1015415"},{"key":"26_CR35","doi-asserted-by":"crossref","unstructured":"Collobert, R., Bengio, S.: Links between perceptrons, MLPs and SVMs. In: International Conference on Machine Learning, ICML (2004b)","DOI":"10.1145\/1015330.1015415"},{"key":"26_CR36","first-page":"2493","volume":"12","author":"R. Collobert","year":"2011","unstructured":"Collobert, R., Weston, J., Bottou, L., Karlen, M., Kavukcuoglu, K., Kuksa, P.: Natural language processing (almost) from scratch. Journal of Machine Learning Research\u00a012, 2493\u20132537 (2011a)","journal-title":"Journal of Machine Learning Research"},{"key":"26_CR37","unstructured":"Collobert, R., Kavukcuoglu, K., Farabet, C.: Torch7: A matlab-like environment for machine learning. In: BigLearn, NIPS Workshop (2011b)"},{"key":"26_CR38","unstructured":"Courville, A., Bergstra, J., Bengio, Y.: Unsupervised models of images by spike-and-slab RBMs. In: ICML 2011 (2011)"},{"key":"26_CR39","unstructured":"Dauphin, Y., Glorot, X., Bengio, Y.: Sampled reconstruction for large-scale learning of embeddings. In: Proc. ICML 2011 (2011)"},{"issue":"6","key":"26_CR40","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1002\/(SICI)1097-4571(199009)41:6<391::AID-ASI1>3.0.CO;2-9","volume":"41","author":"S. Deerwester","year":"1990","unstructured":"Deerwester, S., Dumais, S.T., Furnas, G.W., Landauer, T.K., Harshman, R.: Indexing by latent semantic analysis. J. Am. Soc. Information Science\u00a041(6), 391\u2013407 (1990)","journal-title":"J. Am. Soc. Information Science"},{"key":"26_CR41","unstructured":"Duchi, J., Hazan, E., Singer, Y.: Adaptive subgradient methods for online learning and stochastic optimization. Journal of Machine Learning Research (2011)"},{"key":"26_CR42","doi-asserted-by":"publisher","first-page":"781","DOI":"10.1016\/0010-0277(93)90058-4","volume":"48","author":"J.L. Elman","year":"1993","unstructured":"Elman, J.L.: Learning and development in neural networks: The importance of starting small. Cognition\u00a048, 781\u2013799 (1993)","journal-title":"Cognition"},{"key":"26_CR43","unstructured":"Erhan, D., Courville, A., Bengio, Y.: Understanding representations learned in deep architectures. Technical Report 1355, Universit\u00e9 de Montr\u00e9al\/DIRO (2010a)"},{"key":"26_CR44","first-page":"625","volume":"11","author":"D. Erhan","year":"2010","unstructured":"Erhan, D., Bengio, Y., Courville, A., Manzagol, P.-A., Vincent, P., Bengio, S.: Why does unsupervised pre-training help deep learning? J. Machine Learning Res.\u00a011, 625\u2013660 (2010b)","journal-title":"J. Machine Learning Res."},{"issue":"5","key":"26_CR45","doi-asserted-by":"publisher","first-page":"768","DOI":"10.1109\/72.712151","volume":"9","author":"P. Frasconi","year":"1998","unstructured":"Frasconi, P., Gori, M., Sperduti, A.: A general framework for adaptive processing of data structures. IEEE Transactions on Neural Networks\u00a09(5), 768\u2013786 (1998)","journal-title":"IEEE Transactions on Neural Networks"},{"issue":"1","key":"26_CR46","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1162\/neco.1992.4.1.1","volume":"4","author":"S. Geman","year":"1992","unstructured":"Geman, S., Bienenstock, E., Doursat, R.: Neural networks and the bias\/variance dilemma. Neural Computation\u00a04(1), 1\u201358 (1992)","journal-title":"Neural Computation"},{"key":"26_CR47","doi-asserted-by":"crossref","unstructured":"Getoor, L., Taskar, B.: Introduction to Statistical Relational Learning. MIT Press (2006)","DOI":"10.7551\/mitpress\/7432.001.0001"},{"key":"26_CR48","unstructured":"Glorot, X., Bengio, Y.: Understanding the difficulty of training deep feedforward neural networks. In: AISTATS 2010, pp. 249\u2013256 (2010)"},{"key":"26_CR49","unstructured":"Glorot, X., Bordes, A., Bengio, Y. (2011a). Deep sparse rectifier neural networks. In: AISTATS 2011 (2011)"},{"key":"26_CR50","unstructured":"Glorot, X., Bordes, A., Bengio, Y.: Domain adaptation for large-scale sentiment classification: A deep learning approach. In: ICML 2011 (2011b)"},{"key":"26_CR51","unstructured":"Goodfellow, I., Le, Q., Saxe, A., Ng, A.: Measuring invariances in deep networks. In: NIPS 2009, pp. 646\u2013654 (2009)"},{"key":"26_CR52","unstructured":"Goodfellow, I., Courville, A., Bengio, Y.: Spike-and-slab sparse coding for unsupervised feature discovery. In: NIPS Workshop on Challenges in Learning Hierarchical Models (2011)"},{"key":"26_CR53","unstructured":"Graepel, T., Candela, J.Q., Borchert, T., Herbrich, R.: Web-scale Bayesian click-through rate prediction for sponsored search advertising in microsoft\u2019s bing search engine. In: ICML (2010)"},{"key":"26_CR54","doi-asserted-by":"crossref","unstructured":"H\u00e5stad, J.: Almost optimal lower bounds for small depth circuits. In: STOC 1986, pp. 6\u201320 (1986)","DOI":"10.1145\/12130.12132"},{"key":"26_CR55","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1007\/BF01272517","volume":"1","author":"J. H\u00e5stad","year":"1991","unstructured":"H\u00e5stad, J., Goldmann, M.: On the power of small-depth threshold circuits. Computational Complexity\u00a01, 113\u2013129 (1991)","journal-title":"Computational Complexity"},{"key":"26_CR56","unstructured":"Hinton, G.E.: Relaxation and its role in vision. Ph.D. thesis, University of Edinburgh (1978)"},{"key":"26_CR57","unstructured":"Hinton, G.E.: Learning distributed representations of concepts. In: Proc. 8th Annual Conf. Cog. Sc. Society, pp. 1\u201312 (1986)"},{"key":"26_CR58","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1016\/0004-3702(89)90049-0","volume":"40","author":"G.E. Hinton","year":"1989","unstructured":"Hinton, G.E.: Connectionist learning procedures. Artificial Intelligence\u00a040, 185\u2013234 (1989)","journal-title":"Artificial Intelligence"},{"key":"26_CR59","unstructured":"Hinton, G.E.: A practical guide to training restricted Boltzmann machines. Technical Report UTML TR 2010-003, Department of Computer Science, University of Toronto (2010)"},{"key":"26_CR60","series-title":"LNCS","first-page":"599","volume-title":"NN: Tricks of the Trade","author":"G.E. Hinton","year":"2012","unstructured":"Hinton, G.E.: A Practical Guide to Training Restricted Boltzmann Machines. In: Montavon, G., Orr, G.B., M\u00fcller, K.-R. (eds.) NN: Tricks of the Trade, 2nd edn. LNCS, vol.\u00a07700, pp. 599\u2013619. Springer, Heidelberg (2012)","edition":"2"},{"key":"26_CR61","doi-asserted-by":"publisher","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"G.E. Hinton","year":"2006","unstructured":"Hinton, G.E., Osindero, S., Teh, Y.-W.: A fast learning algorithm for deep belief nets. Neural Computation\u00a018, 1527\u20131554 (2006)","journal-title":"Neural Computation"},{"key":"26_CR62","unstructured":"Hutter, F.: Automated Configuration of Algorithms for Solving Hard Computational Problems. Ph.D. thesis, University of British Columbia (2009)"},{"key":"26_CR63","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"507","DOI":"10.1007\/978-3-642-25566-3_40","volume-title":"Learning and Intelligent Optimization","author":"F. Hutter","year":"2011","unstructured":"Hutter, F., Hoos, H.H., Leyton-Brown, K.: Sequential model-based optimization for general algorithm configuration. In: Coello Coello, C.A. (ed.) LION 5. LNCS, vol.\u00a06683, pp. 507\u2013523. Springer, Heidelberg (2011)"},{"key":"26_CR64","doi-asserted-by":"crossref","unstructured":"Jarrett, K., Kavukcuoglu, K., Ranzato, M., LeCun, Y.: What is the best multi-stage architecture for object recognition? In: ICCV (2009)","DOI":"10.1109\/ICCV.2009.5459469"},{"key":"26_CR65","doi-asserted-by":"crossref","unstructured":"Kavukcuoglu, K., Ranzato, M.-A., Fergus, R., LeCun, Y.: Learning invariant features through topographic filter maps. In: CVPR 2009 (2009)","DOI":"10.1109\/CVPR.2009.5206545"},{"key":"26_CR66","doi-asserted-by":"publisher","first-page":"380","DOI":"10.1016\/j.cognition.2008.11.014","volume":"110","author":"K.A. Krueger","year":"2009","unstructured":"Krueger, K.A., Dayan, P.: Flexible shaping: how learning in small steps helps. Cognition\u00a0110, 380\u2013394 (2009)","journal-title":"Cognition"},{"key":"26_CR67","doi-asserted-by":"crossref","unstructured":"Lamblin, P., Bengio, Y.: Important gains from supervised fine-tuning of deep architectures on large labeled sets. In: NIPS 2010 Deep Learning and Unsupervised Feature Learning Workshop (2010)","DOI":"10.1561\/9781601982957"},{"key":"26_CR68","unstructured":"Lang, K.J., Hinton, G.E.: The development of the time-delay neural network architecture for speech recognition. Technical Report CMU-CS-88-152, Carnegie-Mellon University (1988)"},{"key":"26_CR69","doi-asserted-by":"crossref","unstructured":"Larochelle, H., Bengio, Y.: Classification using discriminative restricted Boltzmann machines. In: ICML 2008 (2008)","DOI":"10.1145\/1390156.1390224"},{"key":"26_CR70","first-page":"1","volume":"10","author":"H. Larochelle","year":"2009","unstructured":"Larochelle, H., Bengio, Y., Louradour, J., Lamblin, P.: Exploring strategies for training deep neural networks. J. Machine Learning Res.\u00a010, 1\u201340 (2009)","journal-title":"J. Machine Learning Res."},{"key":"26_CR71","unstructured":"Le, Q., Ngiam, J., Chen, Z., Hao Chia, D.J., Koh, P.W., Ng, A.: Tiled convolutional neural networks. In: NIPS 2010 (2010)"},{"key":"26_CR72","unstructured":"Le, Q., Ngiam, J., Coates, A., Lahiri, A., Prochnow, B., Ng, A.: On optimization methods for deep learning. In: ICML 2011 (2011)"},{"key":"26_CR73","unstructured":"Le Roux, N., Manzagol, P.-A., Bengio, Y.: Topmoumoute online natural gradient algorithm. In: NIPS 2007 (2008)"},{"key":"26_CR74","doi-asserted-by":"crossref","unstructured":"Le Roux, N., Bengio, Y., Fitzgibbon, A.: Improving first and second-order methods by modeling uncertainty. In: Optimization for Machine Learning. MIT Press (2011)","DOI":"10.7551\/mitpress\/8996.003.0017"},{"key":"26_CR75","unstructured":"Le Roux, N., Schmidt, M., Bach, F.: A stochastic gradient method with an exponential convergence rate for strongly-convex optimization with finite training sets. Technical report, arXiv:1202.6258 (2012)"},{"key":"26_CR76","unstructured":"LeCun, Y.: Mod\u00e8les connexionistes de l\u2019apprentissage. Ph.D. thesis, Universit\u00e9 de Paris VI (1987)"},{"key":"26_CR77","unstructured":"LeCun, Y.: Generalization and network design strategies. Technical Report CRG-TR-89-4, University of Toronto (1989)"},{"issue":"4","key":"26_CR78","doi-asserted-by":"publisher","first-page":"541","DOI":"10.1162\/neco.1989.1.4.541","volume":"1","author":"Y. LeCun","year":"1989","unstructured":"LeCun, Y., Boser, B., Denker, J.S., Henderson, D., Howard, R.E., Hubbard, W., Jackel, L.D.: Backpropagation applied to handwritten zip code recognition. Neural Computation\u00a01(4), 541\u2013551 (1989)","journal-title":"Neural Computation"},{"key":"26_CR79","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1007\/3-540-49430-8_2","volume-title":"Neural Networks: Tricks of the Trade","author":"Y.A. LeCun","year":"1998","unstructured":"LeCun, Y.A., Bottou, L., Orr, G.B., M\u00fcller, K.-R.: Efficient BackProp. In: Orr, G.B., M\u00fcller, K.-R. (eds.) NIPS-WS 1996. LNCS, vol.\u00a01524, pp. 9\u201350. Springer, Heidelberg (1998a)"},{"issue":"11","key":"26_CR80","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y. LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient based learning applied to document recognition. IEEE\u00a086(11), 2278\u20132324 (1998b)","journal-title":"IEEE"},{"key":"26_CR81","unstructured":"Lee, H., Ekanadham, C., Ng, A. (2008). Sparse deep belief net model for visual area V2. In: NIPS 2007 (2007)"},{"key":"26_CR82","doi-asserted-by":"crossref","unstructured":"Lee, H., Grosse, R., Ranganath, R., Ng, A.Y.: Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations. In: ICML 2009 (2009)","DOI":"10.1145\/1553374.1553453"},{"key":"26_CR83","unstructured":"Martens, J.: Deep learning via Hessian-free optimization. In: ICML 2010, pp. 735\u2013742 (2010)"},{"key":"26_CR84","unstructured":"Mesnil, G., Dauphin, Y., Glorot, X., Rifai, S., Bengio, Y., Goodfellow, I., Lavoie, E., Muller, X., Desjardins, G., Warde-Farley, D., Vincent, P., Courville, A., Bergstra, J.: Unsupervised and transfer learning challenge: a deep learning approach. In: Proc. Unsupervised and Transfer Learning, JMLR W&CP, vol.\u00a07 (2011)"},{"key":"26_CR85","unstructured":"Montavon, G., Braun, M.L., M\u00fcller, K.-R.: Deep Boltzmann machines as feed-forward hierarchies. In: AISTATS 2012 (2012)"},{"key":"26_CR86","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted Boltzmann machines. In: ICML 2010 (2010)"},{"key":"26_CR87","unstructured":"Nemirovski, A., Yudin, D.: Problem complexity and method efficiency in optimization. Wiley (1983)"},{"issue":"1","key":"26_CR88","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1007\/s10107-007-0149-x","volume":"120","author":"Y. Nesterov","year":"2009","unstructured":"Nesterov, Y.: Primal-dual subgradient methods for convex problems. Mathematical Programming\u00a0120(1), 221\u2013259 (2009)","journal-title":"Mathematical Programming"},{"key":"26_CR89","doi-asserted-by":"publisher","first-page":"3311","DOI":"10.1016\/S0042-6989(97)00169-7","volume":"37","author":"B.A. Olshausen","year":"1997","unstructured":"Olshausen, B.A., Field, D.J.: Sparse coding with an overcomplete basis set: a strategy employed by V1? Vision Research\u00a037, 3311\u20133325 (1997)","journal-title":"Vision Research"},{"issue":"1","key":"26_CR90","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1162\/neco.1994.6.1.147","volume":"6","author":"B. Pearlmutter","year":"1994","unstructured":"Pearlmutter, B.: Fast exact multiplication by the Hessian. Neural Computation\u00a06(1), 147\u2013160 (1994)","journal-title":"Neural Computation"},{"key":"26_CR91","doi-asserted-by":"crossref","unstructured":"Pinto, N., Doukhan, D., DiCarlo, J.J., Cox, D.D.: A high-throughput screening approach to discovering good forms of biologically inspired visual representation. PLoS Comput. Biol. 5(11), e1000579 (2009)","DOI":"10.1371\/journal.pcbi.1000579"},{"issue":"1","key":"26_CR92","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1016\/0004-3702(90)90005-K","volume":"46","author":"J.B. Pollack","year":"1990","unstructured":"Pollack, J.B.: Recursive distributed representations. Artificial Intelligence\u00a046(1), 77\u2013105 (1990)","journal-title":"Artificial Intelligence"},{"issue":"4","key":"26_CR93","doi-asserted-by":"publisher","first-page":"838","DOI":"10.1137\/0330046","volume":"30","author":"B. Polyak","year":"1992","unstructured":"Polyak, B., Juditsky, A.: Acceleration of stochastic approximation by averaging. SIAM J. Control and Optimization\u00a030(4), 838\u2013855 (1992)","journal-title":"SIAM J. Control and Optimization"},{"key":"26_CR94","unstructured":"Raiko, T., Valpola, H., LeCun, Y. (2012). Deep learning made easier by linear transformations in perceptrons. In: AISTATS 2012 (2012)"},{"key":"26_CR95","doi-asserted-by":"crossref","unstructured":"Ranzato, M., Poultney, C., Chopra, S., LeCun, Y.: Efficient learning of sparse representations with an energy-based model. In: NIPS 2006 (2007)","DOI":"10.7551\/mitpress\/7503.003.0147"},{"key":"26_CR96","first-page":"1185","volume-title":"Advances in Neural Information Processing Systems (NIPS 2007)","author":"M. Ranzato","year":"2008","unstructured":"Ranzato, M., Boureau, Y.-L., LeCun, Y.: Sparse feature learning for deep belief networks. In: Platt, J., Koller, D., Singer, Y., Roweis, S. (eds.) Advances in Neural Information Processing Systems (NIPS 2007), vol.\u00a020, pp. 1185\u20131192. MIT Press, Cambridge (2008a)"},{"key":"26_CR97","unstructured":"Ranzato, M., Boureau, Y., LeCun, Y.: Sparse feature learning for deep belief networks. In: NIPS 2007 (2008b)"},{"key":"26_CR98","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1007\/s10994-006-5833-1","volume":"62","author":"M. Richardson","year":"2006","unstructured":"Richardson, M., Domingos, P.: Markov logic networks. Machine Learning\u00a062, 107\u2013136 (2006)","journal-title":"Machine Learning"},{"key":"26_CR99","unstructured":"Rifai, S., Vincent, P., Muller, X., Glorot, X., Bengio, Y.: Contracting auto-encoders: Explicit invariance during feature extraction. In: ICML 2011 (2011a)"},{"key":"26_CR100","unstructured":"Rifai, S., Dauphin, Y., Vincent, P., Bengio, Y., Muller, X.: The manifold tangent classifier. In: NIPS 2011 (2011b)"},{"key":"26_CR101","doi-asserted-by":"crossref","unstructured":"Rifai, S., Bengio, Y., Dauphin, Y., Vincent, P.: A generative process for sampling contractive auto-encoders. In: ICML 2012 (2012)","DOI":"10.1007\/978-3-642-23783-6_41"},{"key":"26_CR102","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1214\/aoms\/1177729586","volume":"22","author":"H. Robbins","year":"1951","unstructured":"Robbins, H., Monro, S.: A stochastic approximation method. Annals of Mathematical Statistics\u00a022, 400\u2013407 (1951)","journal-title":"Annals of Mathematical Statistics"},{"key":"26_CR103","doi-asserted-by":"publisher","first-page":"533","DOI":"10.1038\/323533a0","volume":"323","author":"D.E. Rumelhart","year":"1986","unstructured":"Rumelhart, D.E., Hinton, G.E., Williams, R.J.: Learning representations by back-propagating errors. Nature\u00a0323, 533\u2013536 (1986)","journal-title":"Nature"},{"key":"26_CR104","unstructured":"Salakhutdinov, R., Hinton, G.: Deep Boltzmann machines. In: AISTATS 2009 (2009)"},{"key":"26_CR105","unstructured":"Saxe, A.M., Koh, P.W., Chen, Z., Bhand, M., Suresh, B., Ng, A.: On random weights and unsupervised feature learning. In: ICML 2011 (2011)"},{"key":"26_CR106","unstructured":"Schaul, T., Zhang, S., LeCun, Y.: No More Pesky Learning Rates. Technical report (2012)"},{"key":"26_CR107","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"207","DOI":"10.1007\/3-540-49430-8_11","volume-title":"Neural Networks: Tricks of the Trade","author":"N.N. Schraudolph","year":"1998","unstructured":"Schraudolph, N.N.: Centering Neural Network Gradient Factors. In: Orr, G.B., M\u00fcller, K.-R. (eds.) NIPS-WS 1996. LNCS, vol.\u00a01524, pp. 207\u2013548. Springer, Heidelberg (1998)"},{"key":"26_CR108","unstructured":"Socher, R., Manning, C., Ng, A.Y.: Parsing natural scenes and natural language with recursive neural networks. In: ICML 2011 (2011)"},{"key":"26_CR109","first-page":"627","volume":"12","author":"A. Srinivasan","year":"2011","unstructured":"Srinivasan, A., Ramakrishnan, G.: Parameter screening and optimisation for ILP using designed experiments. Journal of Machine Learning Research\u00a012, 627\u2013662 (2011)","journal-title":"Journal of Machine Learning Research"},{"key":"26_CR110","doi-asserted-by":"crossref","unstructured":"Swersky, K., Chen, B., Marlin, B., de Freitas, N.: A tutorial on stochastic approximation algorithms for training restricted boltzmann machines and deep belief nets. In: Information Theory and Applications Workshop (2010)","DOI":"10.1109\/ITA.2010.5454138"},{"issue":"5500","key":"26_CR111","doi-asserted-by":"publisher","first-page":"2319","DOI":"10.1126\/science.290.5500.2319","volume":"290","author":"J. Tenenbaum","year":"2000","unstructured":"Tenenbaum, J., de Silva, V., Langford, J.C.: A global geometric framework for nonlinear dimensionality reduction. Science\u00a0290(5500), 2319\u20132323 (2000)","journal-title":"Science"},{"key":"26_CR112","doi-asserted-by":"crossref","unstructured":"Tieleman, T., Hinton, G.: Using fast weights to improve persistent contrastive divergence. In: ICML 2009 (2009)","DOI":"10.1145\/1553374.1553506"},{"key":"26_CR113","unstructured":"van der Maaten, L., Hinton, G.E.: Visualizing data using t-sne. J. Machine Learning Res.\u00a09 (2008)"},{"key":"26_CR114","doi-asserted-by":"crossref","unstructured":"Vincent, P.: A connection between score matching and denoising autoencoders. Neural Computation\u00a023(7) (2011)","DOI":"10.1162\/NECO_a_00142"},{"key":"26_CR115","doi-asserted-by":"crossref","unstructured":"Vincent, P., Larochelle, H., Bengio, Y., Manzagol, P.-A.: Extracting and composing robust features with denoising autoencoders. In: ICML 2008 (2008)","DOI":"10.1145\/1390156.1390294"},{"key":"26_CR116","unstructured":"Vincent, P., Larochelle, H., Lajoie, I., Bengio, Y., Manzagol, P.-A.: Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion. J. Machine Learning Res.\u00a011 (2010)"},{"key":"26_CR117","doi-asserted-by":"crossref","unstructured":"Weston, J., Ratle, F., Collobert, R.: Deep learning via semi-supervised embedding. In: ICML 2008 (2008)","DOI":"10.1145\/1390156.1390303"},{"key":"26_CR118","unstructured":"Weston, J., Bengio, S., Usunier, N.: Wsabie: Scaling up to large vocabulary image annotation. In: Proceedings of the International Joint Conference on Artificial Intelligence, IJCAI (2011)"},{"issue":"4","key":"26_CR119","doi-asserted-by":"publisher","first-page":"715","DOI":"10.1162\/089976602317318938","volume":"14","author":"L. Wiskott","year":"2002","unstructured":"Wiskott, L., Sejnowski, T.J.: Slow feature analysis: Unsupervised learning of invariances. Neural Computation\u00a014(4), 715\u2013770 (2002)","journal-title":"Neural Computation"},{"key":"26_CR120","unstructured":"Zou, W.Y., Ng, A.Y., Yu, K.: Unsupervised learning of visual invariance with temporal coherence. In: NIPS 2011 Workshop on Deep Learning and Unsupervised Feature Learning (2011)"}],"container-title":["Lecture Notes in Computer Science","Neural Networks: Tricks of the Trade"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-35289-8_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,1]],"date-time":"2024-05-01T10:56:07Z","timestamp":1714560967000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-35289-8_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642352881","9783642352898"],"references-count":120,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-35289-8_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012]]}}}