{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T11:44:57Z","timestamp":1771933497144,"version":"3.50.1"},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2023,8,14]],"date-time":"2023-08-14T00:00:00Z","timestamp":1691971200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,8,14]],"date-time":"2023-08-14T00:00:00Z","timestamp":1691971200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Nat Mach Intell"],"DOI":"10.1038\/s42256-023-00700-x","type":"journal-article","created":{"date-parts":[[2023,8,14]],"date-time":"2023-08-14T16:03:12Z","timestamp":1692028992000},"page":"908-918","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["Activity\u2013weight duality in feed-forward neural networks reveals two co-determinants for generalization"],"prefix":"10.1038","volume":"5","author":[{"given":"Yu","family":"Feng","sequence":"first","affiliation":[]},{"given":"Wei","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4589-981X","authenticated-orcid":false,"given":"Yuhai","family":"Tu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,14]]},"reference":[{"key":"700_CR1","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun, Y., Bengio, Y. & Hinton, G. Deep learning. Nature 521, 436\u2013444 (2015).","journal-title":"Nature"},{"key":"700_CR2","unstructured":"Goodfellow, I., Courville, A. & Bengio, Y. Deep Learning Vol. 1 (MIT Press, 2016)."},{"key":"700_CR3","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S. & Sun, J. Deep residual learning for image recognition. In Proc. IEEE Conference on Computer Vision and Pattern Recognition 770\u2013778 (IEEE, 2016).","DOI":"10.1109\/CVPR.2016.90"},{"key":"700_CR4","unstructured":"Wu, Y. et al. Google\u2019s neural machine translation system: bridging the gap between human and machine translation. Preprint at https:\/\/arxiv.org\/abs\/1609.08144 (2016)."},{"key":"700_CR5","doi-asserted-by":"publisher","first-page":"484","DOI":"10.1038\/nature16961","volume":"529","author":"D Silver","year":"2016","unstructured":"Silver, D. et al. Mastering the game of Go with deep neural networks and tree search. Nature 529, 484\u2013489 (2016).","journal-title":"Nature"},{"key":"700_CR6","doi-asserted-by":"publisher","first-page":"583","DOI":"10.1038\/s41586-021-03819-2","volume":"596","author":"J Jumper","year":"2021","unstructured":"Jumper, J. et al. Highly accurate protein structure prediction with AlphaFold. Nature 596, 583\u2013589 (2021).","journal-title":"Nature"},{"key":"700_CR7","unstructured":"Jiang, Y., Neyshabur, B., Mobahi, H., Krishnan, D. & Bengio, S. Fantastic generalization measures and where to find them. In 8th International Conference on Learning Representations (2020)."},{"key":"700_CR8","unstructured":"Keskar, N. S., Mudigere, D., Nocedal, J., Smelyanskiy, M. & Tang, P. T. P. On large-batch training for deep learning: generalization gap and sharp minima. In 5th International Conference on Learning Representations (2017)."},{"key":"700_CR9","unstructured":"Dinh, L., Pascanu, R., Bengio, S. & Bengio, Y. Sharp minima can generalize for deep nets. In International Conference on Machine Learning 1019\u20131028 (PMLR, 2017)."},{"key":"700_CR10","unstructured":"Zhu, Z., Wu, J., Yu, B., Wu, L. & Ma, J. The anisotropic noise in stochastic gradient descent: its behavior of escaping from sharp minima and regularization effects. In Proc. International Conference on Machine Learning 7654\u20137663 (PMLR, 2019)."},{"key":"700_CR11","first-page":"1","volume":"21","author":"J Martens","year":"2020","unstructured":"Martens, J. New insights and perspectives on the natural gradient method. J. Mach. Learn. Res. 21, 1\u201376 (2020).","journal-title":"J. Mach. Learn. Res."},{"key":"700_CR12","doi-asserted-by":"crossref","unstructured":"Chaudhari, P., & Soatto, S. Stochastic gradient descent performs variational inference, converges to limit cycles for deep networks. In 2018 Information Theory and Applications Workshop (ITA) 1\u201310 (IEEE, 2018).","DOI":"10.1109\/ITA.2018.8503224"},{"key":"700_CR13","doi-asserted-by":"publisher","first-page":"e2015617118","DOI":"10.1073\/pnas.2015617118","volume":"118","author":"Y Feng","year":"2021","unstructured":"Feng, Y. & Tu, Y. The inverse variance\u2013flatness relation in stochastic gradient descent is critical for finding flat minima. Proc. Natl Acad. Sci. USA 118, e2015617118 (2021).","journal-title":"Proc. Natl Acad. Sci. USA"},{"key":"700_CR14","doi-asserted-by":"publisher","first-page":"237101","DOI":"10.1103\/PhysRevLett.130.237101","volume":"130","author":"N Yang","year":"2023","unstructured":"Yang, N., Tang, C. & Tu, Y. Stochastic gradient descent introduces an effective landscape-dependent regularization favoring flat solutions. Phys. Rev. Lett. 130, 237101 (2023).","journal-title":"Phys. Rev. Lett."},{"key":"700_CR15","first-page":"10677","volume":"32","author":"AS Golatkar","year":"2019","unstructured":"Golatkar, A. S., Achille, A. & Soatto, S. Time matters in regularizing deep networks: weight decay and data augmentation affect early learning dynamics, matter little near convergence. Adv. Neural Inf. Process. Syst. 32, 10677\u201310687 (2019).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"700_CR16","first-page":"5330","volume":"30","author":"X Lian","year":"2017","unstructured":"Lian, X. et al. Can decentralized algorithms outperform centralized algorithms? A case study for decentralized parallel stochastic gradient descent. Adv. Neural Inf. Process. Syst. 30, 5330\u20135340 (2017).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"700_CR17","first-page":"112:1","volume":"20","author":"CJ Shallue","year":"2019","unstructured":"Shallue, C. J. et al. Measuring the effects of data parallelism on neural network training. J. Mach. Learn. Res. 20, 112:1\u2013112:49 (2019).","journal-title":"J. Mach. Learn. Res."},{"key":"700_CR18","unstructured":"Lian, X., Zhang, W., Zhang, C. & Liu, J. Asynchronous decentralized parallel stochastic gradient descent. In International Conference on Machine Learning 3043\u20133052 (2018)."},{"key":"700_CR19","unstructured":"Zhang, W. et al. Loss landscape dependent self-adjusting learning rates in decentralized stochastic gradient descent. Preprint at https:\/\/arxiv.org\/abs\/2112.01433 (2021)."},{"key":"700_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S. & Sun, J. Deep residual learning for image recognition. In Proc. IEEE Conference on Computer Vision and Pattern Recognition 770\u2013778 (IEEE, 2016).","DOI":"10.1109\/CVPR.2016.90"},{"key":"700_CR21","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Van Der Maaten, L. & Weinberger, K. Q. Densely connected convolutional networks. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2261\u20132269 (IEEE, 2017).","DOI":"10.1109\/CVPR.2017.243"},{"key":"700_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1162\/neco.1997.9.1.1","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S. & Schmidhuber, J. Flat minima. Neural Comput. 9, 1\u201342 (1997).","journal-title":"Neural Comput."},{"key":"700_CR23","unstructured":"Wei, C. & Ma, T. Improved sample complexities for deep networks and robust classification via an all-layer margin. In International Conference on Learning Representations (2020)."},{"key":"700_CR24","doi-asserted-by":"publisher","first-page":"124018","DOI":"10.1088\/1742-5468\/ab39d9","volume":"2019","author":"P Chaudhari","year":"2019","unstructured":"Chaudhari, P. et al. Entropy-SGD: biasing gradient descent into wide valleys. J. Stat. Mech. Theor. E 2019, 124018 (2019).","journal-title":"J. Stat. Mech. Theor. E"},{"key":"700_CR25","unstructured":"Foret, P., Kleiner, A., Mobahi, H. & Neyshabur, B. Sharpness-aware minimization for efficiently improving generalization. In 9th International Conference on Learning Representations (2021)."},{"key":"700_CR26","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1073\/pnas.1908636117","volume":"117","author":"C Baldassi","year":"2020","unstructured":"Baldassi, C., Pittorino, F. & Zecchina, R. Shaping the learning landscape in neural networks around wide flat minima. Proc. Natl Acad. Sci. USA 117, 161\u2013170 (2020).","journal-title":"Proc. Natl Acad. Sci. USA"},{"key":"700_CR27","unstructured":"Yang, R., Mao, J. & Chaudhari, P. Does the data induce capacity control in deep learning? In International Conference on Machine Learning 25166\u201325197 (PMLR, 2022)."},{"key":"700_CR28","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1109\/MSP.2012.2211477","volume":"29","author":"L Deng","year":"2012","unstructured":"Deng, L. The MNIST database of handwritten digit images for machine learning research. IEEE Signal Process. Mag. 29, 141\u2013142 (2012).","journal-title":"IEEE Signal Process. Mag."},{"key":"700_CR29","unstructured":"Krizhevsky, A., Nair, V. & Hinton, G. CIFAR-10. Canadian Institute for Advanced Research http:\/\/www.cs.toronto.edu\/~kriz\/cifar.html (2010)."},{"key":"700_CR30","doi-asserted-by":"publisher","unstructured":"Feng, Y. A\u2013W duality in neural network: the flatter and smaller solution is better for generalization. Zenodo https:\/\/doi.org\/10.5281\/zenodo.8031053 (2023).","DOI":"10.5281\/zenodo.8031053"}],"container-title":["Nature Machine Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s42256-023-00700-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-023-00700-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-023-00700-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,12]],"date-time":"2023-12-12T18:13:25Z","timestamp":1702404805000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s42256-023-00700-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,14]]},"references-count":30,"journal-issue":{"issue":"8","published-online":{"date-parts":[[2023,8]]}},"alternative-id":["700"],"URL":"https:\/\/doi.org\/10.1038\/s42256-023-00700-x","relation":{},"ISSN":["2522-5839"],"issn-type":[{"value":"2522-5839","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,8,14]]},"assertion":[{"value":"4 April 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 June 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 August 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}