{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T04:07:04Z","timestamp":1769918824882,"version":"3.49.0"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2019,10,23]],"date-time":"2019-10-23T00:00:00Z","timestamp":1571788800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,10,23]],"date-time":"2019-10-23T00:00:00Z","timestamp":1571788800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"NSF","award":["1906169"],"award-info":[{"award-number":["1906169"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1007\/s10994-019-05839-6","type":"journal-article","created":{"date-parts":[[2019,10,23]],"date-time":"2019-10-23T23:59:57Z","timestamp":1571875197000},"page":"467-492","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":154,"title":["Gradient descent optimizes over-parameterized deep ReLU networks"],"prefix":"10.1007","volume":"109","author":[{"given":"Difan","family":"Zou","sequence":"first","affiliation":[]},{"given":"Yuan","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Dongruo","family":"Zhou","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9830-793X","authenticated-orcid":false,"given":"Quanquan","family":"Gu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,10,23]]},"reference":[{"key":"5839_CR1","unstructured":"Allen-Zhu, Z., Li, Y., & Song, Z. (2018a). A convergence theory for deep learning via over-parameterization. arXiv preprint \narXiv:1811.03962"},{"key":"5839_CR2","unstructured":"Allen-Zhu, Z., Li, Y., & Song, Z. (2018b) On the convergence rate of training recurrent neural networks. arXiv preprint \narXiv:1810.12065"},{"key":"5839_CR3","unstructured":"Arora, S., Cohen, N., Golowich, N., & Hu, W. (2018a). A convergence analysis of gradient descent for deep linear neural networks. arXiv preprint \narXiv:1810.02281"},{"key":"5839_CR4","unstructured":"Arora, S., Cohen, N., & Hazan, E. (2018b). On the optimization of deep networks: Implicit acceleration by overparameterization. arXiv preprint \narXiv:1802.06509"},{"key":"5839_CR5","unstructured":"Bartlett, P., Helmbold, D., & Long, P. (2018). Gradient descent with identity initialization efficiently learns positive definite linear transformations. In International conference on machine learning, pp. 520\u2013529."},{"key":"5839_CR6","unstructured":"Brutzkus, A., & Globerson, A. (2017). Globally optimal gradient descent for a convnet with gaussian inputs. arXiv preprint \narXiv:1702.07966"},{"key":"5839_CR7","unstructured":"Du, S. S., Lee, J. D., & Tian, Y. (2017). When is a convolutional filter easy to learn? arXiv preprint \narXiv:1709.06129"},{"key":"5839_CR8","unstructured":"Du, S. S., Lee, J. D., Li, H., Wang, L., & Zhai, X. (2018a). Gradient descent finds global minima of deep neural networks. arXiv preprint \narXiv:1811.03804"},{"key":"5839_CR9","unstructured":"Du, S. S., Zhai, X., Poczos, B., & Singh, A. (2018b). Gradient descent provably optimizes over-parameterized neural networks. arXiv preprint \narXiv:1810.02054"},{"key":"5839_CR10","unstructured":"Gunasekar, S., Lee, J., Soudry, D., & Srebro, N. (2018). Implicit bias of gradient descent on linear convolutional networks. arXiv preprint \narXiv:1806.00468"},{"key":"5839_CR11","unstructured":"Hanin, B. (2017). Universal function approximation by deep neural nets with bounded width and ReLU activations. arXiv preprint \narXiv:1708.02691"},{"key":"5839_CR12","unstructured":"Hanin, B., Sellke, M. (2017). Approximating continuous functions by ReLU nets of minimal width. arXiv preprint \narXiv:1710.11278"},{"key":"5839_CR13","unstructured":"Hardt, M., & Ma, T. (2016). Identity matters in deep learning. arXiv preprint \narXiv:1611.04231"},{"key":"5839_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2015). Delving deep into rectifiers: Surpassing human-level performance on imagenet classification. In Proceedings of the IEEE international conference on computer vision, pp 1026\u20131034.","DOI":"10.1109\/ICCV.2015.123"},{"key":"5839_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"6","key":"5839_CR16","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton, G., Deng, L., Yu, D., Dahl, G. E., Ar, Mohamed, Jaitly, N., et al. (2012). Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups. IEEE Signal Processing Magazine, 29(6), 82\u201397.","journal-title":"IEEE Signal Processing Magazine"},{"issue":"8","key":"5839_CR17","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"issue":"2","key":"5839_CR18","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1016\/0893-6080(91)90009-T","volume":"4","author":"K Hornik","year":"1991","unstructured":"Hornik, K. (1991). Approximation capabilities of multilayer feedforward networks. Neural Networks, 4(2), 251\u2013257.","journal-title":"Neural Networks"},{"key":"5839_CR19","unstructured":"Kawaguchi, K. (2016). Deep learning without poor local minima. In Advances in Neural Information Processing Systems, pp 586\u2013594."},{"key":"5839_CR20","unstructured":"Krizhevsky, A. (2009). Learning multiple layers of features from tiny images. Tech. rep., Citeseer."},{"key":"5839_CR21","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems, pp 1097\u20131105."},{"issue":"11","key":"5839_CR22","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., & Haffner, P. (1998). Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278\u20132324.","journal-title":"Proceedings of the IEEE"},{"key":"5839_CR23","unstructured":"Li, Y., & Liang, Y. (2018). Learning overparameterized neural networks via stochastic gradient descent on structured data. arXiv preprint \narXiv:1808.01204"},{"key":"5839_CR24","unstructured":"Li, Y., & Yuan, Y. (2017). Convergence analysis of two-layer neural networks with ReLU activation. arXiv preprint \narXiv:1705.09886"},{"key":"5839_CR25","unstructured":"Liang, S., & Srikant, R. (2016). Why deep neural networks for function approximation? arXiv preprint \narXiv:1610.04161"},{"key":"5839_CR26","unstructured":"Lin, H., & Jegelka, S. (2018). Resnet with one-neuron hidden layers is a universal approximator. In Advances in neural information processing systems, pp. 6172\u20136181."},{"key":"5839_CR27","unstructured":"Lu, Z., Pu, H., Wang, F., Hu, Z., & Wang, L. (2017). The expressive power of neural networks: A view from the width. arXiv preprint \narXiv:1709.02540"},{"issue":"7587","key":"5839_CR28","doi-asserted-by":"publisher","first-page":"484","DOI":"10.1038\/nature16961","volume":"529","author":"D Silver","year":"2016","unstructured":"Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., Van Den Driessche, G., et al. (2016). Mastering the game of go with deep neural networks and tree search. Nature, 529(7587), 484\u2013489.","journal-title":"Nature"},{"key":"5839_CR29","unstructured":"Telgarsky, M. (2015). Representation benefits of deep feedforward networks. arXiv preprint \narXiv:1509.08101"},{"key":"5839_CR30","unstructured":"Telgarsky, M. (2016). Benefits of depth in neural networks. arXiv preprint \narXiv:1602.04485"},{"key":"5839_CR31","unstructured":"Tian, Y. (2017). An analytical formula of population gradient for two-layered ReLU network and its applications in convergence and critical point analysis. arXiv preprint \narXiv:1703.00560"},{"key":"5839_CR32","unstructured":"Vershynin, R. (2010). Introduction to the non-asymptotic analysis of random matrices. arXiv preprint \narXiv:1011.3027"},{"key":"5839_CR33","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1016\/j.neunet.2017.07.002","volume":"94","author":"D Yarotsky","year":"2017","unstructured":"Yarotsky, D. (2017). Error bounds for approximations with deep ReLU networks. Neural Networks, 94, 103\u2013114.","journal-title":"Neural Networks"},{"key":"5839_CR34","unstructured":"Yarotsky, D. (2018). Optimal approximation of continuous functions by very deep ReLU networks. arXiv preprint \narXiv:1802.03620"},{"key":"5839_CR35","unstructured":"Zhang, X., Yu, Y., Wang, L., & Gu, Q. (2018). Learning one-hidden-layer ReLU networks via gradient descent. arXiv preprint \narXiv:1806.07808"},{"key":"5839_CR36","unstructured":"Zhou, D. X. (2019). Universality of deep convolutional neural networks. In Applied and computational harmonic analysis."},{"key":"5839_CR37","unstructured":"Zou, D., Cao, Y., Zhou, D., & Gu, Q. (2018). Stochastic gradient descent optimizes over-parameterized deep relu networks. arXiv preprint \narXiv:1811.08888"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-019-05839-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-019-05839-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-019-05839-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,10,23]],"date-time":"2020-10-23T00:05:59Z","timestamp":1603411559000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-019-05839-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,23]]},"references-count":37,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2020,3]]}},"alternative-id":["5839"],"URL":"https:\/\/doi.org\/10.1007\/s10994-019-05839-6","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,10,23]]},"assertion":[{"value":"4 May 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 July 2019","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 September 2019","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 October 2019","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}