{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T00:36:08Z","timestamp":1767141368512,"version":"build-2238731810"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"8-9","license":[{"start":{"date-parts":[[2019,5,9]],"date-time":"2019-05-09T00:00:00Z","timestamp":1557360000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2019,5,9]],"date-time":"2019-05-09T00:00:00Z","timestamp":1557360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/501100000266","name":"Engineering and Physical Sciences Research Council","doi-asserted-by":"crossref","award":["EP\/N035127\/1"],"award-info":[{"award-number":["EP\/N035127\/1"]}],"id":[{"id":"10.13039\/501100000266","id-type":"DOI","asserted-by":"crossref"}]},{"name":"AstraZeneca Data Science Fellowship"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2019,9,15]]},"DOI":"10.1007\/s10994-019-05795-1","type":"journal-article","created":{"date-parts":[[2019,5,10]],"date-time":"2019-05-10T06:59:37Z","timestamp":1557471577000},"page":"1261-1286","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":23,"title":["Efficient feature selection using shrinkage estimators"],"prefix":"10.1007","volume":"108","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6582-7453","authenticated-orcid":false,"given":"Konstantinos","family":"Sechidis","sequence":"first","affiliation":[]},{"given":"Laura","family":"Azzimonti","sequence":"additional","affiliation":[]},{"given":"Adam","family":"Pocock","sequence":"additional","affiliation":[]},{"given":"Giorgio","family":"Corani","sequence":"additional","affiliation":[]},{"given":"James","family":"Weatherall","sequence":"additional","affiliation":[]},{"given":"Gavin","family":"Brown","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,5,9]]},"reference":[{"key":"5795_CR1","volume-title":"Categorical data analysis","author":"A Agresti","year":"2013","unstructured":"Agresti, A. (2013). Categorical data analysis (3rd ed.). New York: Wiley.","edition":"3"},{"issue":"3","key":"5795_CR2","doi-asserted-by":"crossref","first-page":"297","DOI":"10.1007\/s10260-005-0121-y","volume":"14","author":"A Agresti","year":"2005","unstructured":"Agresti, A., & Hitchcock, D. B. (2005). Bayesian inference for categorical data analysis. Statistical Methods and Applications, 14(3), 297\u2013330.","journal-title":"Statistical Methods and Applications"},{"key":"5795_CR3","first-page":"171","volume":"11","author":"CF Aliferis","year":"2010","unstructured":"Aliferis, C. F., Statnikov, A., Tsamardinos, I., Mani, S., & Koutsoukos, X. D. (2010). Local causal and markov blanket induction for causal discovery and feature selection for classification part I: Algorithms and empirical evaluation. Journal of Machine Learning Research (JMLR), 11, 171\u2013234.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"issue":"5","key":"5795_CR4","doi-asserted-by":"crossref","first-page":"1738","DOI":"10.3390\/e15051738","volume":"15","author":"E Archer","year":"2013","unstructured":"Archer, E., Park, I. M., & Pillow, J. W. (2013). Bayesian and quasi-Bayesian estimators for mutual information from discrete data. Entropy, 15(5), 1738\u20131755.","journal-title":"Entropy"},{"issue":"2","key":"5795_CR5","doi-asserted-by":"crossref","first-page":"272","DOI":"10.1109\/TPAMI.2016.2544315","volume":"39","author":"A Barbu","year":"2017","unstructured":"Barbu, A., She, Y., Ding, L., & Gramajo, G. (2017). Feature selection with annealing for computer vision and big data learning. IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI), 39(2), 272\u2013286.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)"},{"issue":"4","key":"5795_CR6","doi-asserted-by":"crossref","first-page":"537","DOI":"10.1109\/72.298224","volume":"5","author":"R Battiti","year":"1994","unstructured":"Battiti, R. (1994). Using mutual information for selecting features in supervised neural net learning. IEEE Transactions on Neural Networks, 5(4), 537\u2013550.","journal-title":"IEEE Transactions on Neural Networks"},{"key":"5795_CR7","doi-asserted-by":"crossref","first-page":"111","DOI":"10.1016\/j.ins.2014.05.042","volume":"282","author":"V Bol\u00f3n-Canedo","year":"2014","unstructured":"Bol\u00f3n-Canedo, V., S\u00e1nchez-Marono, N., Alonso-Betanzos, A., Ben\u00edtez, J. M., & Herrera, F. (2014). A review of microarray datasets and applied feature selection methods. Information Sciences, 282, 111\u2013135.","journal-title":"Information Sciences"},{"key":"5795_CR8","first-page":"163","volume":"18","author":"DR Brillinger","year":"2004","unstructured":"Brillinger, D. R. (2004). Some data analyses using mutual information. Brazilian Journal of Probability and Statistics, 18, 163\u2013182.","journal-title":"Brazilian Journal of Probability and Statistics"},{"key":"5795_CR9","first-page":"27","volume":"13","author":"G Brown","year":"2012","unstructured":"Brown, G., Pocock, A., Zhao, M.-J., & Lujan, M. (2012). Conditional likelihood maximisation: A unifying framework for information theoretic feature selection. Journal of Machine Learning Research (JMLR), 13, 27\u201366.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"5795_CR10","doi-asserted-by":"crossref","DOI":"10.1201\/b14884","volume-title":"Bayes and empirical Bayes methods for data analysis","author":"BP Carlin","year":"2008","unstructured":"Carlin, B. P., & Louis, T. A. (2008). Bayes and empirical Bayes methods for data analysis (3rd ed.). Boca Raton: Chapman & Hall.","edition":"3"},{"key":"5795_CR11","volume-title":"Elements of information theory","author":"TM Cover","year":"2006","unstructured":"Cover, T. M., & Thomas, J. A. (2006). Elements of information theory (2nd ed.). New York: Wiley.","edition":"2"},{"key":"5795_CR12","volume-title":"Large-scale inference: Empirical Bayes methods for estimation, testing, and prediction","author":"B Efron","year":"2012","unstructured":"Efron, B. (2012). Large-scale inference: Empirical Bayes methods for estimation, testing, and prediction (Vol. 1). Cambridge: Cambridge University Press."},{"key":"5795_CR13","first-page":"1531","volume":"5","author":"F Fleuret","year":"2004","unstructured":"Fleuret, F. (2004). Fast binary feature selection with conditional mutual information. Journal of Machine Learning Research (JMLR), 5, 1531\u20131555.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"issue":"Mar","key":"5795_CR14","first-page":"1289","volume":"3","author":"G Forman","year":"2003","unstructured":"Forman, G. (2003). An extensive empirical study of feature selection metrics for text classification. Journal of Machine Learning Research (JMLR), 3(Mar), 1289\u20131305.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"issue":"405","key":"5795_CR15","doi-asserted-by":"crossref","first-page":"165","DOI":"10.1080\/01621459.1989.10478752","volume":"84","author":"JH Friedman","year":"1989","unstructured":"Friedman, J. H. (1989). Regularized discriminant analysis. Journal of the American Statistical Association, 84(405), 165\u2013175.","journal-title":"Journal of the American Statistical Association"},{"key":"5795_CR16","first-page":"1157","volume":"3","author":"I Guyon","year":"2003","unstructured":"Guyon, I., & Elisseeff, A. (2003). An introduction to variable and feature selection. Journal of Machine Learning Research (JMLR), 3, 1157\u20131182.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"5795_CR17","first-page":"1469","volume":"10","author":"J Hausser","year":"2009","unstructured":"Hausser, J., & Strimmer, K. (2009). Entropy inference and the james-stein estimator, with application to nonlinear gene association networks. Journal of Machine Learning Research (JMLR), 10, 1469\u20131484.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"5795_CR18","doi-asserted-by":"crossref","unstructured":"Hutter, M. (2002). Distribution of mutual information. In T. G. Dietterich, S. Becker, & Z. Ghahramani (Eds.), Advances in neural information processing systems (NIPS) (pp. 399\u2013406). MIT Press.","DOI":"10.7551\/mitpress\/1120.003.0056"},{"key":"5795_CR19","unstructured":"Jakulin, A. (2005). Machine learning based on attribute interactions. Ph.D. thesis, University of Ljubljana, Slovenia."},{"key":"5795_CR20","unstructured":"James, W., & Stein, C. (1961). Estimation with quadratic loss. In Proceedings of the Fourth Berkeley symposium on mathematical statistics and probability, Volume 1: Contributions to the theory of statistics (pp. 361\u2013379). University of California Press."},{"issue":"5","key":"5795_CR21","doi-asserted-by":"crossref","first-page":"603","DOI":"10.1016\/S0927-5398(03)00007-0","volume":"10","author":"O Ledoit","year":"2003","unstructured":"Ledoit, O., & Wolf, M. (2003). Improved estimation of the covariance matrix of stock returns with an application to portfolio selection. Journal of Empirical Finance, 10(5), 603\u2013621.","journal-title":"Journal of Empirical Finance"},{"key":"5795_CR22","doi-asserted-by":"crossref","unstructured":"Lewis, David\u00a0D. (1992). Feature selection and feature extraction for text categorization. In Proceedings of the workshop on Speech and Natural Language.","DOI":"10.3115\/1075527.1075574"},{"key":"5795_CR23","doi-asserted-by":"crossref","unstructured":"Lin, D., & Tang, X. (2006). Conditional infomax learning: an integrated framework for feature extraction and fusion. In European conference on computer vision (ECCV)","DOI":"10.1007\/11744023_6"},{"key":"5795_CR24","doi-asserted-by":"crossref","unstructured":"Liu, H., & Ditzler, G. (2017). A fast information-theoretic approximation of joint mutual information feature selection. In IJCNN (pp. 4610\u20134617).","DOI":"10.1109\/IJCNN.2017.7966441"},{"key":"5795_CR25","doi-asserted-by":"crossref","unstructured":"Llinares-L\u00f3pez, F., Sugiyama, M., Papaxanthos, L., & Borgwardt, K. (2015). Fast and memory-efficient significant pattern mining via permutation testing. In Proceedings of the 21th ACM SIGKDD international conference on knowledge discovery and data mining (pp. 725\u2013734). ACM.","DOI":"10.1145\/2783258.2783363"},{"key":"5795_CR26","doi-asserted-by":"crossref","unstructured":"Meyer, P. E., & Bontempi, G. (2006). On the use of variable complementarity for feature selection in cancer classification. In Works on the application of evolutionary algorithms.","DOI":"10.1007\/11732242_9"},{"issue":"3","key":"5795_CR27","doi-asserted-by":"crossref","first-page":"261","DOI":"10.1109\/JSTSP.2008.923858","volume":"2","author":"PE Meyer","year":"2008","unstructured":"Meyer, P. E., Schretter, C., & Bontempi, G. (2008). Information-theoretic feature selection in microarray data using variable complementarity. IEEE Journal of Selected Topics in Signal Processing, 2(3), 261\u2013274.","journal-title":"IEEE Journal of Selected Topics in Signal Processing"},{"key":"5795_CR28","doi-asserted-by":"crossref","unstructured":"Nemenman, I., Shafee, F., & Bialek, W. (2002). Entropy and inference, revisited. In T. G. Dietterich, S. Becker, & Z. Ghahramani (Eds.), Advances in neural information processing systems (NIPS) (pp. 471\u2013478). MIT Press.","DOI":"10.7551\/mitpress\/1120.003.0065"},{"issue":"6","key":"5795_CR29","doi-asserted-by":"crossref","first-page":"1191","DOI":"10.1162\/089976603321780272","volume":"15","author":"L Paninski","year":"2003","unstructured":"Paninski, L. (2003). Estimation of entropy and mutual information. Neural Computation, 15(6), 1191\u20131253.","journal-title":"Neural Computation"},{"key":"5795_CR30","unstructured":"Papaxanthos, L., Llinares-L\u00f3pez, F., Bodenham, D., & Borgwardt, K. (2016). Finding significant combinations of features in the presence of categorical covariates. In D. D. Lee, M. Sugiyama, U. V. Luxburg, I. Guyon, & R. Garnett (Eds.), Advances in neural information processing systems (pp. 2279\u20132287). Curran Associates, Inc."},{"issue":"8","key":"5795_CR31","doi-asserted-by":"crossref","first-page":"1226","DOI":"10.1109\/TPAMI.2005.159","volume":"27","author":"H Peng","year":"2005","unstructured":"Peng, H., Long, F., & Ding, C. (2005). Feature selection based on mutual information criteria of max-dependency, max-relevance, and min-redundancy. IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI), 27(8), 1226\u20131238.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)"},{"issue":"1","key":"5795_CR32","doi-asserted-by":"crossref","first-page":"1175","DOI":"10.2202\/1544-6115.1175","volume":"4","author":"J Sch\u00e4fer","year":"2005","unstructured":"Sch\u00e4fer, J., & Strimmer, K. (2005). A shrinkage approach to large-scale covariance matrix estimation and implications for functional genomics. Statistical Applications in Genetics and Molecular Biology, 4(1), 1175\u20131189.","journal-title":"Statistical Applications in Genetics and Molecular Biology"},{"issue":"16\u201317","key":"5795_CR33","doi-asserted-by":"crossref","first-page":"3233","DOI":"10.1080\/03610926.2011.593284","volume":"41","author":"M Scutari","year":"2012","unstructured":"Scutari, M., & Brogini, A. (2012). Bayesian network structure learning with permutation tests. Communications in Statistics\u2014Theory and Methods, 41(16\u201317), 3233\u20133243.","journal-title":"Communications in Statistics\u2014Theory and Methods"},{"issue":"2","key":"5795_CR34","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1007\/s10994-017-5648-2","volume":"107","author":"K Sechidis","year":"2018","unstructured":"Sechidis, K., & Brown, G. (2018). Simple strategies for semi-supervised feature selection. Machine Learning, 107(2), 357\u2013395.","journal-title":"Machine Learning"},{"key":"5795_CR35","doi-asserted-by":"crossref","first-page":"159","DOI":"10.1016\/j.ijar.2017.04.002","volume":"85","author":"K Sechidis","year":"2017","unstructured":"Sechidis, K., Sperrin, M., Petherick, E. S., Lujn, M., & Brown, G. (2017). Dealing with under-reported variables: An information theoretic solution. International Journal of Approximate Reasoning, 85, 159\u2013177.","journal-title":"International Journal of Approximate Reasoning"},{"key":"5795_CR36","first-page":"12","volume":"1","author":"K Sechidis","year":"2018","unstructured":"Sechidis, K., Papangelou, K., Metcalfe, P. D., Svensson, D., Weatherall, J., & Brown, G. (2018). Distinguishing prognostic and predictive biomarkers: An information theoretic approach. Bioinformatics, 1, 12.","journal-title":"Bioinformatics"},{"issue":"Suppl 2","key":"5795_CR37","doi-asserted-by":"crossref","first-page":"S231","DOI":"10.1093\/bioinformatics\/18.suppl_2.S231","volume":"18","author":"R Steuer","year":"2002","unstructured":"Steuer, R., Kurths, J., Daub, C., Weise, J., & Selbig, J. (2002). The mutual information: Detecting and evaluating dependencies between variables. Bioinformatics, 18(Suppl 2), S231\u2013S240.","journal-title":"Bioinformatics"},{"issue":"32","key":"5795_CR38","doi-asserted-by":"crossref","first-page":"12996","DOI":"10.1073\/pnas.1302233110","volume":"110","author":"A Terada","year":"2013","unstructured":"Terada, A., Okada-Hatakeyama, M., Tsuda, K., & Sese, J. (2013). Statistical significance of combinatorial regulations. Proceedings of the National Academy of Sciences, 110(32), 12996\u201313001.","journal-title":"Proceedings of the National Academy of Sciences"},{"issue":"1","key":"5795_CR39","doi-asserted-by":"crossref","first-page":"175","DOI":"10.1007\/s00521-013-1368-0","volume":"24","author":"JR Vergara","year":"2014","unstructured":"Vergara, J. R., & Est\u00e9vez, P. A. (2014). A review of feature selection methods based on mutual information. Neural Computing and Applications, 24(1), 175\u2013186.","journal-title":"Neural Computing and Applications"},{"key":"5795_CR40","doi-asserted-by":"crossref","first-page":"46","DOI":"10.1016\/j.patcog.2015.11.007","volume":"53","author":"NX Vinh","year":"2016","unstructured":"Vinh, N. X., Zhou, S., Chan, J., & Bailey, J. (2016). Can high-order dependencies improve mutual information based feature selection? Pattern Recognition, 53, 46\u201358.","journal-title":"Pattern Recognition"},{"key":"5795_CR41","unstructured":"Yang, H. H., & Moody, J. (1999). Data visualization and feature selection: New algorithms for nongaussian data. In S. A. Solla, T. K. Leen, & K. M\u00fcller (Eds.), Advances in neural information processing systems (NIPS) (pp. 687\u2013693). MIT Press."}],"updated-by":[{"DOI":"10.1007\/s10994-020-05884-6","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2020,6,4]],"date-time":"2020-06-04T00:00:00Z","timestamp":1591228800000}}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-019-05795-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-019-05795-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-019-05795-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,17]],"date-time":"2024-07-17T18:06:09Z","timestamp":1721239569000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-019-05795-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,5,9]]},"references-count":41,"journal-issue":{"issue":"8-9","published-print":{"date-parts":[[2019,9,15]]}},"alternative-id":["5795"],"URL":"https:\/\/doi.org\/10.1007\/s10994-019-05795-1","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,5,9]]},"assertion":[{"value":"19 July 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 April 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 May 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 June 2020","order":4,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Correction","order":5,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"There was a mistake in the proof of the optimal shrinkage intensity for our estimator presented in Section 3.1.","order":6,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}}]}}