{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,5]],"date-time":"2026-01-05T22:27:18Z","timestamp":1767652038722,"version":"3.37.3"},"reference-count":87,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1109\/tnnls.2024.3356310","type":"journal-article","created":{"date-parts":[[2024,2,7]],"date-time":"2024-02-07T18:50:04Z","timestamp":1707331804000},"page":"3329-3342","source":"Crossref","is-referenced-by-count":1,"title":["Attentive Learning Facilitates Generalization of Neural Networks"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7810-9346","authenticated-orcid":false,"given":"Shiye","family":"Lei","sequence":"first","affiliation":[{"name":"Sydney AI Centre and the School of Computer Science, Faculty of Engineering, The University of Sydney, Darlington, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5584-2385","authenticated-orcid":false,"given":"Fengxiang","family":"He","sequence":"additional","affiliation":[{"name":"Artificial Intelligence and its Applications Institute, School of Informatics, University of Edinburgh, Edinburgh, U.K"}]},{"given":"Haowen","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Mathematics, ETH Z&#x00FC;rich, Z&#x00FC;rich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-5449","authenticated-orcid":false,"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[{"name":"Sydney AI Centre and the School of Computer Science, Faculty of Engineering, The University of Sydney, Darlington, NSW, Australia"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"25","author":"Krizhevsky"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref3","article-title":"Image captions are natural prompts for text-to-image models","author":"Lei","year":"2023","journal-title":"arXiv:2307.08526"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/TNNLS.2020.3046924"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref6","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1016\/j.artint.2023.103860"},{"key":"ref8","article-title":"Playing Atari with deep reinforcement learning","author":"Mnih","year":"2013","journal-title":"arXiv:1312.5602"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.13140\/RG.2.2.18893.74727"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1038\/nature24270"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1109\/CVPR.2018.00568"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1002\/rob.21918"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1038\/s41586-019-1923-7"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.1038\/s41586-021-03819-2"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1016\/j.artint.2022.103667"},{"doi-asserted-by":"publisher","key":"ref16","DOI":"10.1016\/j.artint.2021.103627"},{"volume-title":"Foundations of Machine Learning","year":"2018","author":"Mohri","key":"ref17"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1016\/j.artint.2022.103803"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1016\/j.artint.2022.103811"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1109\/TNNLS.2021.3109942"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/TNNLS.2023.3297113"},{"key":"ref22","article-title":"Spectrally-normalized margin bounds for neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Bartlett"},{"volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Neyshabur","article-title":"A PAC-Bayesian approach to spectrally-normalized margin bounds for neural networks","key":"ref23"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.1016\/j.artint.2023.103951"},{"volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kawaguchi","article-title":"Deep learning without poor local minima","key":"ref25"},{"key":"ref26","article-title":"Depth creates no bad local minima","author":"Lu","year":"2017","journal-title":"arXiv:1702.08580"},{"volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Zhou","article-title":"Critical points of neural networks: Analytical forms and landscape properties","key":"ref27"},{"issue":"1","key":"ref28","first-page":"2822","article-title":"The implicit bias of gradient descent on separable data","volume":"19","author":"Soudry","year":"2018","journal-title":"J. Mach. Learn. Res."},{"volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Lyu","article-title":"Gradient descent maximizes the margin of homogeneous neural networks","key":"ref29"},{"key":"ref30","first-page":"17176","article-title":"Directional convergence and alignment in deep learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Ji"},{"key":"ref31","first-page":"3496","article-title":"SGD on neural networks learns functions of increasing complexity","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Kalimeris"},{"year":"2009","author":"Krizhevsky","article-title":"Learning multiple layers of features from tiny images","key":"ref32"},{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1145\/3446776"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.1109\/TCYB.2020.2979968"},{"doi-asserted-by":"publisher","key":"ref35","DOI":"10.1109\/TCYB.2021.3062881"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.1109\/TNNLS.2022.3167409"},{"doi-asserted-by":"publisher","key":"ref37","DOI":"10.1109\/TNNLS.2020.2966319"},{"doi-asserted-by":"publisher","key":"ref38","DOI":"10.1109\/TNNLS.2021.3131813"},{"key":"ref39","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","author":"Keskar","year":"2016","journal-title":"arXiv:1609.04836"},{"volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Xie","article-title":"A diffusion theory for deep learning dynamics: Stochastic gradient descent exponentially favors flat minima","key":"ref40"},{"doi-asserted-by":"publisher","key":"ref41","DOI":"10.1016\/j.artint.2022.103739"},{"key":"ref42","first-page":"1305","article-title":"Implicit bias of gradient descent for wide two-layer neural networks trained with the logistic loss","volume-title":"Proc. Conf. Learn. Theory","author":"Chizat"},{"key":"ref43","article-title":"Gradient descent on two-layer nets: Margin maximization and simplicity bias","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Lyu"},{"year":"2021","author":"Lei","article-title":"Understanding deep learning via large-scale systematic experiments","key":"ref44"},{"key":"ref45","article-title":"Spatial-temporal-fusion BNN: Variational Bayesian feature layer","author":"Lei","year":"2021","journal-title":"arXiv:2112.06281"},{"doi-asserted-by":"publisher","key":"ref46","DOI":"10.1073\/pnas.1903070116"},{"doi-asserted-by":"publisher","key":"ref47","DOI":"10.1088\/1742-5468\/ac3a74"},{"key":"ref48","first-page":"5301","article-title":"On the spectral bias of neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Rahaman"},{"key":"ref49","article-title":"Frequency principle: Fourier analysis sheds light on deep neural networks","author":"John Xu","year":"2019","journal-title":"arXiv:1901.06523"},{"key":"ref50","article-title":"Neural networks behave as hash encoders: An empirical study","author":"He","year":"2021","journal-title":"arXiv:2101.05490"},{"key":"ref51","first-page":"7978","article-title":"Open-set label noise can improve robustness against inherent label noise","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Wei"},{"key":"ref52","article-title":"Hold me tight! Influence of discriminative features on deep network boundaries","author":"Ortiz-Jimenez","year":"2020","journal-title":"arXiv:2002.06349"},{"key":"ref53","article-title":"The pitfalls of simplicity bias in neural networks","author":"Shah","year":"2020","journal-title":"arXiv:2006. 07710"},{"doi-asserted-by":"publisher","key":"ref54","DOI":"10.1109\/TNNLS.2023.3326654"},{"doi-asserted-by":"publisher","key":"ref55","DOI":"10.1088\/0954-898X_5_4_006"},{"doi-asserted-by":"publisher","key":"ref56","DOI":"10.1126\/science.290.5500.2319"},{"doi-asserted-by":"publisher","key":"ref57","DOI":"10.1162\/089976603321780317"},{"doi-asserted-by":"publisher","key":"ref58","DOI":"10.1090\/jams\/852"},{"doi-asserted-by":"publisher","key":"ref59","DOI":"10.1109\/TPAMI.2023.3322540"},{"volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Pope","article-title":"The intrinsic dimension of images and its impact on learning","key":"ref60"},{"doi-asserted-by":"publisher","key":"ref61","DOI":"10.1109\/TSP.2009.2031722"},{"key":"ref62","article-title":"k-NN regression adapts to local intrinsic dimension","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"24","author":"Kpotufe"},{"doi-asserted-by":"publisher","key":"ref63","DOI":"10.1038\/s41598-020-72222-0"},{"doi-asserted-by":"publisher","key":"ref64","DOI":"10.1007\/978-3-642-41181-6_5"},{"doi-asserted-by":"publisher","key":"ref65","DOI":"10.1109\/TPAMI.2014.2343220"},{"doi-asserted-by":"publisher","key":"ref66","DOI":"10.1145\/347090.347145"},{"key":"ref67","doi-asserted-by":"crossref","first-page":"179","DOI":"10.1016\/B978-012099975-0.50005-1","article-title":"Multidimensional scaling","volume-title":"Measurement, Judgment and Decision Making","author":"Carroll","year":"1998"},{"doi-asserted-by":"publisher","key":"ref68","DOI":"10.1126\/science.290.5500.2323"},{"doi-asserted-by":"publisher","key":"ref69","DOI":"10.1002\/0470013192.bsa501"},{"key":"ref70","article-title":"Intrinsic dimension estimation using packing numbers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"15","author":"K\u00e9gl"},{"doi-asserted-by":"publisher","key":"ref71","DOI":"10.1016\/j.patcog.2008.09.016"},{"doi-asserted-by":"publisher","key":"ref72","DOI":"10.1103\/PhysRevLett.52.1661"},{"key":"ref73","article-title":"Maximum likelihood estimation of intrinsic dimension","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"17","author":"Levina"},{"doi-asserted-by":"publisher","key":"ref74","DOI":"10.1016\/j.patcog.2014.02.013"},{"doi-asserted-by":"publisher","key":"ref75","DOI":"10.1038\/s41598-017-11873-y"},{"key":"ref76","article-title":"Sample complexity of testing the manifold hypothesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"23","author":"Narayanan"},{"key":"ref77","first-page":"1","article-title":"Adaptive approximation and generalization of deep neural network with intrinsic dimensionality","volume":"21","author":"Nakada","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref78","article-title":"The effect of the intrinsic dimension on the generalization of quadratic classifiers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Latorre"},{"volume-title":"Deep Learning","year":"2016","author":"Goodfellow","key":"ref79"},{"key":"ref80","first-page":"499","article-title":"Stability and generalization","volume":"2","author":"Bousquet","year":"2002","journal-title":"J. Mach. Learn. Res."},{"key":"ref81","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014","journal-title":"arXiv:1409.1556"},{"doi-asserted-by":"publisher","key":"ref82","DOI":"10.5244\/c.30.87"},{"volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Dosovitskiy","article-title":"An image is worth 16 \u00d7 16 words: Transformers for image recognition at scale","key":"ref83"},{"issue":"8","key":"ref84","first-page":"2","article-title":"Neural networks for machine learning lecture 6a overview of mini-batch gradient descent","volume":"14","author":"Hinton","year":"2012","journal-title":"Cited on"},{"volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Kingma","article-title":"Adam: A method for stochastic optimization","key":"ref85"},{"key":"ref86","first-page":"7721","article-title":"Accuracy on the line: On the strong correlation between out-of-distribution and in-distribution generalization","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Miller"},{"volume-title":"Active learning literature survey","year":"2009","author":"Settles","key":"ref87"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5962385\/10877690\/10423835.pdf?arnumber=10423835","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T07:10:48Z","timestamp":1738912248000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10423835\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2]]},"references-count":87,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2024.3356310","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"type":"print","value":"2162-237X"},{"type":"electronic","value":"2162-2388"}],"subject":[],"published":{"date-parts":[[2025,2]]}}}