{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T20:16:43Z","timestamp":1775852203156,"version":"3.50.1"},"reference-count":76,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100008982","name":"National Science Foundation CISE: CRII Award","doi-asserted-by":"publisher","award":["CIF-194780"],"award-info":[{"award-number":["CIF-194780"]}],"id":[{"id":"10.13039\/501100008982","id-type":"DOI","asserted-by":"publisher"}]},{"name":"MIT-IBM Watson AI Lab"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Areas Inf. Theory"],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/jsait.2020.2991561","type":"journal-article","created":{"date-parts":[[2020,4,30]],"date-time":"2020-04-30T20:07:14Z","timestamp":1588277234000},"page":"19-38","source":"Crossref","is-referenced-by-count":131,"title":["The Information Bottleneck Problem and its Applications in Machine Learning"],"prefix":"10.1109","volume":"1","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3406-3950","authenticated-orcid":false,"given":"Ziv","family":"Goldfeld","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2109-0979","authenticated-orcid":false,"given":"Yury","family":"Polyanskiy","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref73","first-page":"530","article-title":"Mutual information neural estimation","author":"belghazi","year":"2018","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683351"},{"key":"ref71","author":"han","year":"2017","journal-title":"Optimal rates of entropy estimation over Lipschitz balls"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT.2015.7282680"},{"key":"ref76","author":"goldfeld","year":"2020","journal-title":"Limit distribution for smooth total variation and ?&#x00B2;-divergence in high dimensions"},{"key":"ref74","author":"chung","year":"2019","journal-title":"Neural entropic estimation A faster path to mutual information estimation"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/18.771151"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1162\/089976603321780272"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1090\/S0002-9947-1936-1501854-3"},{"key":"ref33","first-page":"1","article-title":"Differential entropy estimation under Gaussian noise","author":"goldfeld","year":"2018","journal-title":"Proc IEEE Int Conf Sci Elect Eng 
(ICSEE)"},{"key":"ref32","first-page":"1","article-title":"?-VAE: Learning basic visual concepts with a constrained variational framework","author":"higgins","year":"2017","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref31","first-page":"368","article-title":"Deep variational information bottleneck","author":"alemi","year":"2017","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref30","first-page":"617","article-title":"Agglomerative information bottleneck","author":"slonim","year":"2000","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref37","author":"cover","year":"2006","journal-title":"Elements of Information Theory"},{"key":"ref36","first-page":"1465","article-title":"Minimal achievable sufficient statistic learning","author":"cvitkovic","year":"2019","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1162\/NECO_a_00961"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ISIT.2019.8849414"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-017-1054-2"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevE.69.066138"},{"key":"ref61","first-page":"2524","article-title":"Information-theoretic analysis of generalization capability of learning algorithms","author":"xu","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.3390\/e19070361"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1970.1054469"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1016\/0893-6080(89)90014-2"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1962.1057738"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevA.45.6056"},{"key":"ref66","author":"advani","year":"2017","journal-title":"High-dimensional dynamics of generalization error in neural networks"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.3390\/e22020151"},{"key":"ref67","author":"achille","year":"2019","journal-title":"Where is the information in a deep neural network?"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2004.833360"},{"key":"ref69","first-page":"2157","article-title":"Estimating the unseen: Improved estimators for entropy and other properties","author":"valiant","year":"2013","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref2","first-page":"781","article-title":"On the uniform convergence of relative frequencies of events to their probabilities","volume":"181","author":"vapnik","year":"1968","journal-title":"Rep Acad Sci USSR"},{"key":"ref1","author":"hastie","year":"2009","journal-title":"The Elements of Statistical Learning Data Mining Inference and Prediction"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1975.1055437"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1972.1054855"},{"key":"ref21","first-page":"149","article-title":"Common information is far less than mutual information","volume":"2","author":"g\u00e1cs","year":"1973","journal-title":"Probl Control Inf Theory"},{"key":"ref24","article-title":"Distributed variational representation learning","author":"estella-aguerri","year":"2019","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1972.1054753"},{"key":"ref26","first-page":"35","article-title":"Distributed information bottleneck method for discrete and Gaussian 
sources","author":"estella-aguerri","year":"2018","journal-title":"Proc Int Zurich Seminar on Communications (IZS)"},{"key":"ref25","first-page":"165","article-title":"Information bottleneck for Gaussian variables","volume":"6","author":"chechik","year":"2005","journal-title":"J Mach Learn Res"},{"key":"ref50","article-title":"Explaining and harnessing adversarial examples","author":"goodfellow","year":"2015","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/SP.2017.49"},{"key":"ref59","first-page":"1517","article-title":"Benefits of depth in neural networks","author":"telgarsky","year":"2016","journal-title":"Proc Conf Learn Theory (COLT)"},{"key":"ref58","first-page":"907","article-title":"The power of depth for feedforward neural networks","author":"eldan","year":"2016","journal-title":"Proc Conf Learn Theory (COLT)"},{"key":"ref57","first-page":"2924","article-title":"On the number of linear regions of deep neural networks","author":"montufar","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref56","first-page":"1476","article-title":"Convergence diagnostics for stochastic gradient descent with constant learning rate","author":"chee","year":"2018","journal-title":"Proc Artif Intell Statist (AISTATS) Conf"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511569920.005"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781139030687"},{"key":"ref53","first-page":"245","article-title":"On the dimension and entropy of order ? of the mixture of probability distributions","volume":"13","author":"csisz\u00e1r","year":"1962","journal-title":"Acta Mathematica Hungarica"},{"key":"ref52","first-page":"2924","article-title":"Recurrent models of visual attention","author":"mnih","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref10","author":"gabri\u00e9","year":"2018","journal-title":"Entropy and mutual information in models of deep neural networks"},{"key":"ref11","author":"yu","year":"2018","journal-title":"Understanding convolutional neural network training with information theory"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1037\/h0058165"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_11"},{"key":"ref13","first-page":"2299","article-title":"Estimating information flow in neural networks","author":"goldfeld","year":"2019","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref14","author":"wickstr\u00f8m","year":"2019","journal-title":"Information plane analysis of deep neural networks via matrix-based Renyi&#x2019;s entropy and tensor kernels"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2909031"},{"key":"ref16","first-page":"368","article-title":"The information bottleneck method","author":"tishby","year":"1999","journal-title":"Proc Allerton Conf Commun Control Comput"},{"key":"ref17","author":"goldfeld","year":"2019","journal-title":"Convergence of smoothed empirical measures with applications to entropy estimation"},{"key":"ref18","first-page":"305","article-title":"Completeness, similar regions, and unbiased estimation: Part I","volume":"10","author":"lehmann","year":"1950","journal-title":"Sankhya Indian J Stat"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177729694"},{"key":"ref4","doi-asserted-by":"crossref","first-page":"436","DOI":"10.1038\/nature14539","article-title":"Deep 
learning","volume":"521","author":"lecun","year":"2015","journal-title":"Nature"},{"key":"ref3","first-page":"248","author":"minsky","year":"1969","journal-title":"Perceptrons"},{"key":"ref6","author":"shwartz-ziv","year":"2017","journal-title":"Opening the black box of deep neural networks via information"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ITW.2015.7133169"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2784440"},{"key":"ref7","first-page":"1","article-title":"On the emergence of invariance and disentangling in deep representations","volume":"19","author":"achille","year":"2018","journal-title":"J Mach Learn Res"},{"key":"ref49","first-page":"1","article-title":"Regularizing neural networks by penalizing confident output distributions","author":"pereyra","year":"2017","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref9","first-page":"1","article-title":"On the information bottleneck theory of deep learning","author":"saxe","year":"2018","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2013.2288257"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1973.1055037"},{"key":"ref48","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"srivastava","year":"2014","journal-title":"J Mach Learn Res"},{"key":"ref47","article-title":"Auto-encoding variational Bayes","author":"kingma","year":"2014","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref42","author":"polyanskiy","year":"2017","journal-title":"Lecture Notes on Information Theory"},{"key":"ref41","author":"berger","year":"1971","journal-title":"Rate-Distortion Theory A Mathematical Basis for Data Compression"},{"key":"ref44","article-title":"Multiterminal source coding","author":"tung","year":"1978"},{"key":"ref43","first-page":"171","article-title":"Multiterminal source coding","volume":"229","author":"berger","year":"1978","journal-title":"Information Theory Approach to Communications"}],"container-title":["IEEE Journal on Selected Areas in Information Theory"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/8700143\/8768428\/9082644-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8700143\/8768428\/09082644.pdf?arnumber=9082644","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T13:16:35Z","timestamp":1651065395000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9082644\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":76,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/jsait.2020.2991561","relation":{},"ISSN":["2641-8770"],"issn-type":[{"value":"2641-8770","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,5]]}}}