{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T10:18:53Z","timestamp":1779358733426,"version":"3.51.4"},"publisher-location":"Berlin, Heidelberg","reference-count":28,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642352881","type":"print"},{"value":"9783642352898","type":"electronic"}],"license":[{"start":{"date-parts":[[2012,1,1]],"date-time":"2012-01-01T00:00:00Z","timestamp":1325376000000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2012]]},"DOI":"10.1007\/978-3-642-35289-8_25","type":"book-chapter","created":{"date-parts":[[2012,11,14]],"date-time":"2012-11-14T12:03:17Z","timestamp":1352894597000},"page":"421-436","source":"Crossref","is-referenced-by-count":1327,"title":["Stochastic Gradient Descent Tricks"],"prefix":"10.1007","author":[{"given":"L\u00e9on","family":"Bottou","sequence":"first","affiliation":[]}],"member":"297","reference":[{"key":"25_CR1","first-page":"1737","volume":"10","author":"A. Bordes","year":"2009","unstructured":"Bordes, A., Bottou, L., Gallinari, P.: SGD-QN: Careful quasi-Newton stochastic gradient descent. Journal of Machine Learning Research\u00a010, 1737\u20131754 (2009); with erratum, JMLR 11, 2229\u20132240 (2010)","journal-title":"Journal of Machine Learning Research"},{"key":"25_CR2","unstructured":"Bottou, L., Bousquet, O.: The tradeoffs of large scale learning. In: Platt, J., Koller, D., Singer, Y., Roweis, S. (eds.) Advances in Neural Information Processing Systems, vol.\u00a020, pp. 161\u2013168. NIPS Foundation (2008), http:\/\/books.nips.cc"},{"key":"25_CR3","volume-title":"Online Learning and Neural Networks","author":"L. Bottou","year":"1998","unstructured":"Bottou, L.: Online algorithms and stochastic approximations. In: Saad, D. (ed.) Online Learning and Neural Networks. Cambridge University Press, Cambridge (1998)"},{"key":"25_CR4","unstructured":"Bousquet, O.: Concentration Inequalities and Empirical Processes Theory Applied to the Analysis of Learning Algorithms. Ph.D. thesis, Ecole Polytechnique, Palaiseau, France (2002)"},{"issue":"3","key":"25_CR5","first-page":"273","volume":"20","author":"C. Cortes","year":"1995","unstructured":"Cortes, C., Vapnik, V.: Support-vector network. Machine Learning\u00a020(3), 273\u2013297 (1995)","journal-title":"Machine Learning"},{"key":"25_CR6","volume-title":"Numerical Methods For Unconstrained Optimization and Nonlinear Equations","author":"J. Dennis","year":"1983","unstructured":"Dennis, J., Schnabel, R.B.: Numerical Methods For Unconstrained Optimization and Nonlinear Equations. Prentice-Hall, Inc., Englewood Cliffs (1983)"},{"key":"25_CR7","doi-asserted-by":"crossref","unstructured":"Joachims, T.: Training linear SVMs in linear time. In: Proceedings of the 12th ACM SIGKDD International Conference, New York (2006)","DOI":"10.1145\/1150402.1150429"},{"key":"25_CR8","first-page":"282","volume-title":"Proceedings of the Eighteenth International Conference on Machine Learning (ICML)","author":"J.D. Lafferty","year":"2001","unstructured":"Lafferty, J.D., McCallum, A., Pereira, F.C.N.: Conditional random fields: Probabilistic models for segmenting and labeling sequence data. In: Brodley, C.E., Danyluk, A.P. (eds.) Proceedings of the Eighteenth International Conference on Machine Learning (ICML), pp. 282\u2013289. Morgan Kaufmann, Williams College (2001)"},{"issue":"5","key":"25_CR9","doi-asserted-by":"publisher","first-page":"1974","DOI":"10.1109\/18.705577","volume":"44","author":"W.S. Lee","year":"1998","unstructured":"Lee, W.S., Bartlett, P.L., Williamson, R.C.: The importance of convexity in learning with squared loss. IEEE Transactions on Information Theory\u00a044(5), 1974\u20131980 (1998)","journal-title":"IEEE Transactions on Information Theory"},{"key":"25_CR10","first-page":"361","volume":"5","author":"D.D. Lewis","year":"2004","unstructured":"Lewis, D.D., Yang, Y., Rose, T.G., Li, F.: RCV1: A new benchmark collection for text categorization research. Journal of Machine Learning Research\u00a05, 361\u2013397 (2004)","journal-title":"Journal of Machine Learning Research"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Lin, C.J., Weng, R.C., Keerthi, S.S.: Trust region newton methods for large-scale logistic regression. In: Ghahramani, Z. (ed.) Proc. Twenty-Fourth International Conference on Machine Learning (ICML), pp. 561\u2013568. ACM (2007)","DOI":"10.1145\/1273496.1273567"},{"key":"25_CR12","first-page":"281","volume-title":"Proceedings of the Fifth Berkeley Symposium on Mathematics, Statistics, and Probabilities","author":"J. MacQueen","year":"1967","unstructured":"MacQueen, J.: Some methods for classification and analysis of multivariate observations. In: LeCam, L.M., Neyman, J. (eds.) Proceedings of the Fifth Berkeley Symposium on Mathematics, Statistics, and Probabilities, vol.\u00a01, pp. 281\u2013297. University of California Press, Berkeley (1967)"},{"issue":"2","key":"25_CR13","doi-asserted-by":"publisher","first-page":"245","DOI":"10.5802\/afst.961","volume":"9","author":"P. Massart","year":"2000","unstructured":"Massart, P.: Some applications of concentration inequalities to statistics. Annales de la Facult\u00e9 des Sciences de Toulouse series 6\u00a09(2), 245\u2013303 (2000)","journal-title":"Annales de la Facult\u00e9 des Sciences de Toulouse series 6"},{"key":"25_CR14","volume-title":"Online Learning and Neural Networks","author":"N. Murata","year":"1998","unstructured":"Murata, N.: A statistical study of on-line learning. In: Saad, D. (ed.) Online Learning and Neural Networks. Cambridge University Press, Cambridge (1998)"},{"issue":"4","key":"25_CR15","doi-asserted-by":"publisher","first-page":"838","DOI":"10.1137\/0330046","volume":"30","author":"B.T. Polyak","year":"1992","unstructured":"Polyak, B.T., Juditsky, A.B.: Acceleration of stochastic approximation by averaging. SIAM J. Control Optim.\u00a030(4), 838\u2013855 (1992)","journal-title":"SIAM J. Control Optim."},{"key":"25_CR16","unstructured":"Robbins, H., Siegmund, D.: A convergence theorem for non negative almost supermartingales and some applications. In: Rustagi, J.S. (ed.) Optimizing Methods in Statistics. Academic Press (1971)"},{"key":"25_CR17","unstructured":"Rosenblatt, F.: The perceptron: A perceiving and recognizing automaton. Tech. Rep. 85-460-1, Project PARA, Cornell Aeronautical Lab (1957)"},{"key":"25_CR18","doi-asserted-by":"crossref","first-page":"318","DOI":"10.7551\/mitpress\/5236.001.0001","volume-title":"Parallel Distributed Processing: Explorations in the Microstructure of Cognition","author":"D.E. Rumelhart","year":"1986","unstructured":"Rumelhart, D.E., Hinton, G.E., Williams, R.J.: Learning internal representations by error propagation. In: Parallel Distributed Processing: Explorations in the Microstructure of Cognition, vol.\u00a0I, pp. 318\u2013362. Bradford Books, Cambridge (1986)"},{"key":"25_CR19","unstructured":"Ruppert, D.: Efficient estimations from a slowly convergent robbins-monro process. Tech. Rep. 781, Cornell University Operations Research and Industrial Engineering (1988)"},{"key":"25_CR20","doi-asserted-by":"crossref","unstructured":"Sang, E.F.T.K., Buchholz, S.: Introduction to the CoNLL-2000 shared task: Chunking. In: Cardie, C., Daelemans, W., Nedellec, C., Tjong Kim Sang, E.F. (eds.) Proceedings of CoNLL 2000 and LLL 2000, Lisbon, Portugal, pp. 127\u2013132 (2000)","DOI":"10.3115\/1117601.1117631"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Shalev-Shwartz, S., Singer, Y., Srebro, N.: Pegasos: Primal estimated subgradient solver for SVM. In: Proc. 24th Intl. Conf. on Machine Learning (ICML 2007), pp. 807\u2013814. ACM (2007)","DOI":"10.1145\/1273496.1273598"},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Shalev-Shwartz, S., Srebro, N.: SVM optimization: inverse dependence on training set size. In: Proceedings of the 25th International Machine Learning Conference (ICML 2008), pp. 928\u2013935. ACM (2008)","DOI":"10.1145\/1390156.1390273"},{"key":"25_CR23","doi-asserted-by":"crossref","first-page":"267","DOI":"10.1111\/j.2517-6161.1996.tb02080.x","volume":"58","author":"R. Tibshirani","year":"1996","unstructured":"Tibshirani, R.: Regression shrinkage and selection via the lasso. Journal of the Royal Statistical Society (Series B)\u00a058, 267\u2013288 (1996)","journal-title":"Journal of the Royal Statistical Society (Series B)"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Tsybakov, A.B.: Optimal aggregation of classifiers in statistical learning. Annals of Statististics 32(1) (2004)","DOI":"10.1214\/aos\/1079120131"},{"issue":"2","key":"25_CR25","doi-asserted-by":"publisher","first-page":"264","DOI":"10.1137\/1116025","volume":"16","author":"V.N. Vapnik","year":"1971","unstructured":"Vapnik, V.N., Chervonenkis, A.Y.: On the uniform convergence of relative frequencies of events to their probabilities. Theory of Probability and its Applications\u00a016(2), 264\u2013280 (1971)","journal-title":"Theory of Probability and its Applications"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Widrow, B., Hoff, M.E.: Adaptive switching circuits. In: IRE WESCON Conv. Record, Part 4, pp. 96\u2013104 (1960)","DOI":"10.21236\/AD0241531"},{"key":"25_CR27","unstructured":"Xu, W.: Towards optimal one pass large scale learning with averaged stochastic gradient descent (2011), http:\/\/arxiv.org\/abs\/1107.2490"},{"key":"25_CR28","unstructured":"Zinkevich, M.: Online convex programming and generalized infinitesimal gradient ascent. In: Proc. Twentieth International Conference on Machine Learning (2003)"}],"container-title":["Lecture Notes in Computer Science","Neural Networks: Tricks of the Trade"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-35289-8_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,20]],"date-time":"2025-04-20T21:29:28Z","timestamp":1745184568000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-35289-8_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012]]},"ISBN":["9783642352881","9783642352898"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-35289-8_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012]]}}}