{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,27]],"date-time":"2026-06-27T15:56:38Z","timestamp":1782575798540,"version":"3.54.5"},"publisher-location":"Heidelberg","reference-count":24,"publisher":"Physica-Verlag HD","isbn-type":[{"value":"9783790826036","type":"print"},{"value":"9783790826043","type":"electronic"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2010]]},"DOI":"10.1007\/978-3-7908-2604-3_16","type":"book-chapter","created":{"date-parts":[[2010,11,8]],"date-time":"2010-11-08T10:41:11Z","timestamp":1289212871000},"page":"177-186","source":"Crossref","is-referenced-by-count":3392,"title":["Large-Scale Machine Learning with Stochastic Gradient Descent"],"prefix":"10.1007","author":[{"given":"L\u00e9on","family":"Bottou","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2010,9,30]]},"reference":[{"key":"16_CR1","first-page":"1737","volume":"10","author":"A. Bordes","year":"2009","unstructured":"BORDES. A., BOTTOU, L., and GALLINARI, P. (2009): SGD-QN: Careful Quasi-Newton Stochastic Gradient Descent. Journal of Machine Learning Research, 10:1737-1754. With Erratum (to appear).","journal-title":"Journal of Machine Learning Research"},{"key":"16_CR2","first-page":"161","volume":"20","author":"L. Bottou","year":"2008","unstructured":"BOTTOU, L. and BOUSQUET, O. (2008): The Tradeoffs of Large Scale Learning, In Advances in Neural Information Processing Systems, vol.20, 161-168.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"2","key":"16_CR3","doi-asserted-by":"publisher","first-page":"137","DOI":"10.1002\/asmb.538","volume":"21","author":"L. Bottou","year":"2004","unstructured":"BOTTOU, L. and LECUN, Y. (2004): On-line Learning for Very Large Datasets. Applied Stochastic Models in Business and Industry, 21(2):137-151","journal-title":"Applied Stochastic Models in Business and Industry"},{"key":"16_CR4","volume-title":"Concentration Inequalities and Empirical Processes Theory Applied to the Analysis of Learning Algorithms","author":"O. Bousquet","year":"2002","unstructured":"BOUSQUET, O. (2002): Concentration Inequalities and Empirical Processes Theory Applied to the Analysis of Learning Algorithms. Th\u00e8se de doctorat, Ecole Polytechnique, Palaiseau, France."},{"key":"16_CR5","first-page":"273","volume":"20","author":"C. Cortes","year":"1995","unstructured":"CORTES, C. and VAPNIK, V. N. (1995): Support Vector Networks, Machine Learning, 20:273-297.","journal-title":"Machine Learning"},{"key":"16_CR6","unstructured":"DENNIS, J. E., Jr., and SCHNABEL, R. B. (1983): Numerical Methods For Unconstrained Optimization and Nonlinear Equations. Prentice-Hall"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"JOACHIMS, T. (2006): Training Linear SVMs in Linear Time. In Proceedings of the 12th ACM SIGKDD, ACM Press.","DOI":"10.1145\/1150402.1150429"},{"key":"16_CR8","unstructured":"LAFFERTY, J. D., MCCALLUM, A., and PEREIRA, F. (2001): Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data. In Proceedings of ICML 2001, 282-289, Morgan Kaufman."},{"issue":"5","key":"16_CR9","doi-asserted-by":"publisher","first-page":"1974","DOI":"10.1109\/18.705577","volume":"44","author":"W. S. Lee","year":"1998","unstructured":"LEE, W. S., BARTLETT, P. L., and WILLIAMSON, R. C. (1998): The Importance of Convexity in Learning with Squared Loss. IEEE Transactions on Information Theory, 44(5):1974-1980.","journal-title":"IEEE Transactions on Information Theory"},{"key":"16_CR10","first-page":"361","volume":"5","author":"D. D. Lewis","year":"2004","unstructured":"LEWIS, D. D., YANG, Y., ROSE, T. G., and LI, F. (2004): RCV1: A New Benchmark Collection for Text Categorization Research. Journal of Machine Learning Research, 5:361-397.","journal-title":"Journal of Machine Learning Research"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"LIN, C. J., WENG, R. C., and KEERTHI, S. S. (2007): Trust region Newton methods for large-scale logistic regression. In Proceedings of ICML 2007, 561-568, ACM Press.","DOI":"10.1145\/1273496.1273567"},{"key":"16_CR12","first-page":"281","volume":"1","author":"J. Macqueen","year":"1967","unstructured":"MACQUEEN, J. (1967): Some Methods for Classification and Analysis of Multivariate Observations. In Fifth Berkeley Symposium on Mathematics, Statistics, and Probabilities, vol.1, 281-297, University of California Press.","journal-title":"Fifth Berkeley Symposium on Mathematics, Statistics, and Probabilities"},{"issue":"2","key":"16_CR13","doi-asserted-by":"crossref","first-page":"245","DOI":"10.5802\/afst.961","volume":"6","author":"P. Massart","year":"2000","unstructured":"MASSART, P. (2000): Some applications of concentration inequalities to Statistics, Annales de la Facult\u00e9 des Sciences de Toulouse, series 6,9,(2):245-303.","journal-title":"Annales de la Facult\u00e9 des Sciences de Toulouse, series"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"MURATA, N. (1998): A Statistical Study of On-line Learning. In Online Learning and Neural Networks, Cambridge University Press.","DOI":"10.1017\/CBO9780511569920.005"},{"issue":"4","key":"16_CR15","doi-asserted-by":"publisher","first-page":"838","DOI":"10.1137\/0330046","volume":"30","author":"B. T. Polyak","year":"1992","unstructured":"POLYAK, B. T. and JUDITSKY, A. B. (1992): Acceleration of stochastic approximation by averaging. SIAM J. Control and Optimization, 30(4):838-855.","journal-title":"SIAM J. Control and Optimization"},{"key":"16_CR16","unstructured":"ROSENBLATT, F. (1957): The Perceptron: A perceiving and recognizing automaton. Technical Report 85-460-1, Project PARA, Cornell Aeronautical Lab."},{"key":"16_CR17","first-page":"318","volume":"I","author":"D. E. Rumelhart","year":"1986","unstructured":"RUMELHART, D. E., HINTON, G. E., and WILLIAMS, R. J. (1986): Learning internal representations by error propagation. In Parallel distributed processing: Explorations in the microstructure of cognition, vol.I, 318-362, Bradford Books.","journal-title":"Parallel distributed processing: Explorations in the microstructure of cognition"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"SHALEV-SHWARTZ, S. and SREBRO, N. (2008): SVM optimization: inverse dependence on training set size. In Proceedings of the ICML 2008, 928-935, ACM.","DOI":"10.1145\/1390156.1390273"},{"issue":"1","key":"16_CR19","doi-asserted-by":"crossref","first-page":"267","DOI":"10.1111\/j.2517-6161.1996.tb02080.x","volume":"58","author":"R. Tibshirani","year":"1996","unstructured":"TIBSHIRANI, R. (1996): Regression shrinkage and selection via the Lasso. Journal of the Royal Statistical Society, Series B, 58(1):267-288.","journal-title":"Journal of the Royal Statistical Society, Series B"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"TJONG KIM SANG E. F., and BUCHHOLZ, S. (2000): Introduction to the CoNLL-2000 Shared Task: Chunking. In Proceedings of CoNLL-2000, 127-132.","DOI":"10.3115\/1117601.1117631"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"TSYBAKOV, A. B. (2004): Optimal aggregation of classifiers in statistical learning, Annals of Statististics, 32(1).","DOI":"10.1214\/aos\/1079120131"},{"issue":"2","key":"16_CR22","doi-asserted-by":"publisher","first-page":"264","DOI":"10.1137\/1116025","volume":"16","author":"V. N. Vapnik","year":"1971","unstructured":"VAPNIK, V. N. and CHERVONENKIS, A. YA. (1971): On the Uniform Convergence of Relative Frequencies of Events to Their Probabilities. Theory of Probability and its Applications, 16(2):264-280.","journal-title":"Theory of Probability and its Applications"},{"key":"16_CR23","first-page":"96","volume":"4","author":"B. Widrow","year":"1960","unstructured":"WIDROW, B. and HOFF, M. E. (1960): Adaptive switching circuits. IRE WESCON Conv. Record, Part 4., 96-104.","journal-title":"IRE WESCON Conv. Record, Part"},{"key":"16_CR24","unstructured":"XU, W. (2010): Towards Optimal One Pass Large Scale Learning with Averaged Stochastic Gradient Descent. Journal of Machine Learning Research (to appear)."}],"container-title":["Proceedings of COMPSTAT'2010"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-7908-2604-3_16.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,27]],"date-time":"2025-02-27T17:25:51Z","timestamp":1740677151000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-7908-2604-3_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010]]},"ISBN":["9783790826036","9783790826043"],"references-count":24,"URL":"https:\/\/doi.org\/10.1007\/978-3-7908-2604-3_16","relation":{},"subject":[],"published":{"date-parts":[[2010]]}}}