{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T08:41:06Z","timestamp":1772095266151,"version":"3.50.1"},"reference-count":71,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Cornell Center for Applied Mathematics (CAM) Postdoctoral Fellowship"},{"name":"NSF","award":["CCF-2046018"],"award-info":[{"award-number":["CCF-2046018"]}]},{"name":"NSF","award":["DMS-2210368"],"award-info":[{"award-number":["DMS-2210368"]}]},{"name":"NSF","award":["CCF-2308446"],"award-info":[{"award-number":["CCF-2308446"]}]},{"name":"IBM Academic Award"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Inform. Theory"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1109\/tit.2025.3568697","type":"journal-article","created":{"date-parts":[[2025,5,13]],"date-time":"2025-05-13T13:46:29Z","timestamp":1747143989000},"page":"6227-6247","source":"Crossref","is-referenced-by-count":1,"title":["Information-Theoretic Generalization Bounds for Deep Neural Networks"],"prefix":"10.1109","volume":"71","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1797-6101","authenticated-orcid":false,"given":"Haiyun","family":"He","sequence":"first","affiliation":[{"name":"Center for Applied Mathematics, Cornell University, Ithaca, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3406-3950","authenticated-orcid":false,"given":"Ziv","family":"Goldfeld","sequence":"additional","affiliation":[{"name":"School of Electrical and Computer Engineering, Cornell University, Ithaca, NY, USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1093\/imaiai\/iaz007"},{"key":"ref2","first-page":"5947","article-title":"Exploring generalization in deep learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Neyshabur"},{"key":"ref3","first-page":"888","article-title":"Fisher-rao metric, geometry, and complexity of neural networks","volume-title":"Proc. 22nd Int. Conf. Artif. Intell. Statist.","author":"Liang"},{"key":"ref4","first-page":"254","article-title":"Stronger generalization bounds for deep nets via a compression approach","volume-title":"Proc. 35th Int. Conf. Mach. Learn.","volume":"80","author":"Arora"},{"key":"ref5","first-page":"1","article-title":"Computing nonvacuous generalization bounds for deep (Stochastic) neural networks with many more parameters than training data","volume-title":"Proc. 33rd Conf. Uncertainty Artif. Intell.","author":"Dziugaite"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/279943.279989"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/307400.307435"},{"key":"ref8","article-title":"A PAC-Bayesian approach to spectrally-normalized margin bounds for neural networks","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Neyshabur"},{"key":"ref9","article-title":"Non-vacuous generalization bounds at the ImageNet scale: A PAC-Bayesian compression approach","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Zhou"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.1.1"},{"key":"ref11","first-page":"1019","article-title":"Sharp minima can generalize for deep nets","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","author":"Dinh"},{"key":"ref12","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Keskar"},{"key":"ref13","article-title":"Towards understanding generalization of deep learning: Perspective of loss landscapes","volume-title":"Proc. ICML Workshop Principled Approaches Deep Learn.","author":"Wu"},{"issue":"1","key":"ref14","first-page":"2822","article-title":"The implicit bias of gradient descent on separable data","volume":"19","author":"Soudry","year":"2018","journal-title":"J. Mach. Learn. Res."},{"key":"ref15","article-title":"A Bayesian perspective on generalization and stochastic gradient descent","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Smith"},{"key":"ref16","article-title":"On the generalization mystery in deep learning","author":"Chatterjee","year":"2022","journal-title":"arXiv:2203.10036"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-73074-5_5"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3446776"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1017\/9781009025096.003"},{"key":"ref20","first-page":"2524","article-title":"Information-theoretic analysis of generalization capability of learning algorithms","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Xu"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2019.2945779"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/JSAIT.2020.2991139"},{"key":"ref23","first-page":"7234","article-title":"Chaining mutual information and tightening generalization bounds","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Asadi"},{"key":"ref24","first-page":"4212","article-title":"Chained generalisation bounds","volume-title":"Proc. Conf. Learn. Theory","author":"Clerico"},{"key":"ref25","first-page":"16457","article-title":"Conditioning and processing: Techniques to improve information-theoretic generalization bounds","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Hafez-Kolahi"},{"key":"ref26","first-page":"9925","article-title":"Sharpened generalization bounds based on conditional mutual information and an application to noisy, iterative algorithms","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Haghifam"},{"key":"ref27","first-page":"3437","article-title":"Reasoning about generalization via conditional mutual information","volume-title":"Proc. Conf. Learn. Theory","author":"Steinke"},{"key":"ref28","first-page":"24670","article-title":"Information-theoretic generalization bounds for black-box learning algorithms","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Harutyunyan"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2021.3085190"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/JSAIT.2024.3391900"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT45174.2021.9518043"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT.2019.8849359"},{"issue":"1","key":"ref33","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"J. Mach. Learn. Res."},{"key":"ref34","first-page":"1058","article-title":"Regularization of neural networks using DropConnect","volume-title":"Proc. IEEE Int. Conf. Mach. Learn. (PMLR)","author":"Wan"},{"key":"ref35","first-page":"2299","article-title":"Estimating information flow in deep neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Goldfeld"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2020.2975480"},{"key":"ref37","article-title":"Understanding the role of momentum in stochastic gradient methods","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Gitman"},{"key":"ref38","article-title":"Adding gradient noise improves learning for very deep networks","author":"Neelakantan","year":"2015","journal-title":"arXiv:1511.06807"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1995.7.1.108"},{"key":"ref40","first-page":"1674","article-title":"Non-convex learning via stochastic gradient Langevin dynamics: A nonasymptotic analysis","volume-title":"Proc. Conf. Learn. Theory","author":"Raginsky"},{"key":"ref41","first-page":"8106","article-title":"An exact characterization of the generalization error for the Gibbs algorithm","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Aminian"},{"key":"ref42","article-title":"In search of the real inductive bias: On the role of implicit regularization in deep learning","volume-title":"Proc. 3rd Int. Conf. Learn. Represent.","author":"Neyshabur"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3446776"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3571070"},{"key":"ref46","article-title":"Opening the black box of deep neural networks via information","author":"Shwartz-Ziv","year":"2017","journal-title":"arXiv:1703.00810"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1088\/1742-5468\/ab3985"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2909031"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/JSAIT.2020.2991561"},{"issue":"139","key":"ref50","first-page":"1","article-title":"Chaining meets chain rule: Multilevel entropic regularization and training of neural networks","volume":"21","author":"Asadi","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ITW.2018.8613445"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT.2018.8437571"},{"key":"ref53","first-page":"26:1","article-title":"Generalization bounds for noisy iterative algorithms using properties of additive noise channels","author":"Wang","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref54","first-page":"3526","article-title":"Information-theoretic generalization bounds for stochastic gradient descent","volume-title":"Proc. Conf. Learn. Theory","author":"Neu"},{"key":"ref55","article-title":"On the generalization of models trained with SGD: information-theoretic bounds and implications","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Wang"},{"key":"ref56","first-page":"6306","article-title":"Neural discrete representation learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. Annu. Conf. Neural Inf. Process. Syst.","author":"Van Den Oord"},{"key":"ref57","first-page":"19109","article-title":"Tighter expected generalization error bounds via Wasserstein distance","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Rodr\u00edguez-G\u00e1lvez"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.2307\/1403865"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1016\/0024-3795(93)90331-H"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1137\/1101006"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4939-7005-6_7"},{"key":"ref62","volume-title":"Comparisons of Stochastic Matrices With Applications in Information Theory, Statistics, Economics and Population","author":"Cohen","year":"1998"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1002\/0471200611"},{"key":"ref64","first-page":"681","article-title":"Bayesian learning via stochastic gradient Langevin dynamics","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Welling"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ITW.2016.7606789"},{"key":"ref66","first-page":"2306","article-title":"Universal approximation with deep narrow networks","volume-title":"Proc. Conf. Learn. Theory","author":"Kidger"},{"key":"ref67","first-page":"12282","article-title":"Neural tangent kernel analysis of deep narrow neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Lee"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1090\/gsm\/058"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2021.3130189"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2015.2482978"},{"issue":"3","key":"ref71","first-page":"462","article-title":"Estimates of the proximity of Gaussian measures","volume":"34","author":"Barsov","year":"1987","journal-title":"Sov. Mathematics-Doklady"}],"container-title":["IEEE Transactions on Information Theory"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/18\/11095909\/11003178.pdf?arnumber=11003178","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,26]],"date-time":"2025-07-26T06:26:47Z","timestamp":1753511207000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11003178\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8]]},"references-count":71,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tit.2025.3568697","relation":{},"ISSN":["0018-9448","1557-9654"],"issn-type":[{"value":"0018-9448","type":"print"},{"value":"1557-9654","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8]]}}}