{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T20:45:17Z","timestamp":1780692317883,"version":"3.54.1"},"reference-count":132,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2015,6,30]],"date-time":"2015-06-30T00:00:00Z","timestamp":1435622400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Artif Intell Rev"],"published-print":{"date-parts":[[2015,12]]},"DOI":"10.1007\/s10462-015-9433-y","type":"journal-article","created":{"date-parts":[[2015,6,29]],"date-time":"2015-06-29T22:16:53Z","timestamp":1435616213000},"page":"467-508","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":115,"title":["Dealing with the evaluation of supervised classification algorithms"],"prefix":"10.1007","volume":"44","author":[{"given":"Guzman","family":"Santafe","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"I\u00f1aki","family":"Inza","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jose A.","family":"Lozano","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2015,6,30]]},"reference":[{"issue":"2","key":"9433_CR1","first-page":"113","volume":"1","author":"EL Allwein","year":"2001","unstructured":"Allwein EL, Schapire RE, Singer Y (2001) Reducing multiclass to binary: A unifying approach for margin classifiers. J Mach Learn Res 1(2):113\u2013141","journal-title":"J Mach Learn Res"},{"key":"9433_CR2","unstructured":"Anagnostopoulos C, Hand DJ (2012) hmeasure: the H-measure and other scalar classification performance metrics. http:\/\/CRAN.R-project.org\/package=hmeasure , R package version 1.0"},{"issue":"20","key":"9433_CR3","doi-asserted-by":"crossref","first-page":"1165","DOI":"10.1016\/S0167-8655(99)00084-7","volume":"11\u201313","author":"A Andersson","year":"1999","unstructured":"Andersson A, Davidsson P, Lin\u00e9n J (1999) Measure-based classifier performance evaluation. Pattern Recognit Lett 11\u201313(20):1165\u20131173","journal-title":"Pattern Recognit Lett"},{"key":"9433_CR4","unstructured":"Batuwita R, Palade V (2009) A new performance measure for class imbalance learning. Application to bioinformatics problems. In: Proceedings of the 26th international conference on machine learning and applications, pp 545\u2013550"},{"key":"9433_CR5","first-page":"1089","volume":"5","author":"Y Bengio","year":"2004","unstructured":"Bengio Y, Grandvalet Y (2004) No unbiased estimator of the variance of k-fold cross-validation. J Mach Learn Res 5:1089\u20131105","journal-title":"J Mach Learn Res"},{"key":"9433_CR6","doi-asserted-by":"crossref","first-page":"75","DOI":"10.1007\/0-387-24555-3_5","volume-title":"Statistical modeling and analysis for complex data problems, chap 5","author":"Y Bengio","year":"2005","unstructured":"Bengio Y, Grandvalet Y (2005) Bias in estimating the variance of k-fold cross-validation. In: Duchesne P, R\u00e9millard B (eds) Statistical modeling and analysis for complex data problems, chap 5. Springer, Berlin, pp 75\u201395"},{"key":"9433_CR7","doi-asserted-by":"crossref","unstructured":"Berrar D, Lozano JA (2013) Significance tests or confidence intervals: which are preferable for the comparison of classifiers? J Exp Theor Artif Intell 25(2):189\u2013206","DOI":"10.1080\/0952813X.2012.680252"},{"key":"9433_CR8","doi-asserted-by":"crossref","unstructured":"Bouckaert RR (2004) Estimationg replicability of classifier learning experiments. In: Brodley CE (ed) Proceedings of the 21st international conference on machine learning. ACM","DOI":"10.1145\/1015330.1015338"},{"key":"9433_CR9","unstructured":"Bouckaert RR, Frank E (2004) Evaluating the replicability of significance tests fo comparing learning algorihtms. In: Proceedings of the 8th Pacifica-Asian conference on knowledge discovery and data mining, pp 3\u201312"},{"key":"9433_CR10","unstructured":"Boyd K, Eng KH, Page CD (2013) Area under the precision-recall curve: point estimates and confidence intervals. In: Machine learning and knowledge discovery in databases. ECML PKDD 2013, Part III, pp 451\u2013466"},{"issue":"7","key":"9433_CR11","doi-asserted-by":"crossref","first-page":"1145","DOI":"10.1016\/S0031-3203(96)00142-2","volume":"30","author":"A Bradley","year":"1997","unstructured":"Bradley A (1997) The use of the area under the ROC curve in the evaluation of machine learning algorithms. Pattern Recognit 30(7):1145\u20131159","journal-title":"Pattern Recognit"},{"issue":"6","key":"9433_CR12","doi-asserted-by":"crossref","first-page":"1267","DOI":"10.1016\/j.patcog.2003.08.017","volume":"37","author":"U Braga-Neto","year":"2004","unstructured":"Braga-Neto U, Dougherty E (2004) Bolstered error estimation. Pattern Recognit 37(6):1267\u20131281","journal-title":"Pattern Recognit"},{"key":"9433_CR13","unstructured":"Brain D, Webb GI (1999) On the effect of data set size on bias and variance in classification learning. In: Proceedings of the 4th Australian knowledge acquisition workshop, pp 117\u2013128"},{"key":"9433_CR14","doi-asserted-by":"crossref","unstructured":"Brain D, Webb GI (2002) The need for low bias algorithms in classification learning from large data sets. In: Proceedings of the 16th European conference principles of data mining and knowledge discovery, pp 62\u201373","DOI":"10.1007\/3-540-45681-3_6"},{"key":"9433_CR15","doi-asserted-by":"crossref","unstructured":"Brier GW (1950) Verification of forecasts expressed in terms of probability. Monthly Weather Rev 78:1\u20133","DOI":"10.1175\/1520-0493(1950)078<0001:VOFEIT>2.0.CO;2"},{"issue":"1","key":"9433_CR16","doi-asserted-by":"crossref","first-page":"22","DOI":"10.1109\/TNNLS.2012.2222925","volume":"24","author":"M Budka","year":"2013","unstructured":"Budka M (2013) Density-preserving sampling: robust and efficient alternative to cross-validation for error estimation. IEEE Trans Neural Netw Learn Syst 24(1):22\u201334","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"issue":"3","key":"9433_CR17","doi-asserted-by":"crossref","first-page":"503","DOI":"10.1093\/biomet\/76.3.503","volume":"76","author":"P Burman","year":"1989","unstructured":"Burman P (1989) A comparative study of ordinary cross-validation, v-fold cross-validation and the repeated learning-testing methods. Biometrika 76(3):503\u2013514","journal-title":"Biometrika"},{"key":"9433_CR18","volume-title":"Positive unlabeled learning with applications in computational biology","author":"B Calvo","year":"2010","unstructured":"Calvo B (2010) Positive unlabeled learning with applications in computational biology. Lambert Academic Publishing, Saarbr\u00fccken"},{"issue":"1","key":"9433_CR19","doi-asserted-by":"crossref","first-page":"2000","DOI":"10.1145\/1007730.1007733","volume":"6","author":"NV Chawla","year":"2004","unstructured":"Chawla NV, Japkowicz N (2004) Editorial: Special issue on learning from imbalanced data sets. ACM SIGKDD Explor Newslett 6(1):2000\u20132004","journal-title":"ACM SIGKDD Explor Newslett"},{"key":"9433_CR20","doi-asserted-by":"crossref","first-page":"997","DOI":"10.1037\/0003-066X.49.12.997","volume":"49","author":"J Cohen","year":"1994","unstructured":"Cohen J (1994) The earth is round ( $$p <.05$$ p < . 05 ). Am Psychol 49:997\u20131003","journal-title":"Am Psychol"},{"key":"9433_CR21","unstructured":"Cortes C, Mohri M (2004) AUC optimization vs. error rate minimization. In: Proceedings of the 16th advances in neural information processing systems conference, p 313"},{"key":"9433_CR22","volume-title":"Applied nonparametric statistics","author":"WW Daniel","year":"1990","unstructured":"Daniel WW (1990) Applied nonparametric statistics. Duxbury Thomson Learning, Pacific Grove"},{"key":"9433_CR23","doi-asserted-by":"crossref","unstructured":"Davis J, Goadrich M (2006) The relationship between precision-recall and ROC curves. In: Proceedings of the 23rd international conference on machine learning, pp 233\u2013240","DOI":"10.1145\/1143844.1143874"},{"key":"9433_CR24","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511802843","volume-title":"Bootstrap methods and their application","author":"A Davison","year":"1997","unstructured":"Davison A, Hinkley D (1997) Bootstrap methods and their application. Cambridge University Press, Cambridge"},{"issue":"4","key":"9433_CR25","doi-asserted-by":"crossref","first-page":"1251","DOI":"10.1214\/aos\/1176349736","volume":"13","author":"A Dawid","year":"1985","unstructured":"Dawid A (1985) Calibration-based empirical probability. Ann Stat 13(4):1251\u20131274","journal-title":"Ann Stat"},{"key":"9433_CR26","first-page":"1","volume":"7","author":"J Demsar","year":"2006","unstructured":"Demsar J (2006) Statistical comparisons of classifiers over multiple data sets. J Mach Learn Res 7:1\u201330","journal-title":"J Mach Learn Res"},{"key":"9433_CR27","unstructured":"Demsar J (2008) On the appropriateness of statistical tests in machine learning. In: 3rd workshop on evaluation methods for machine learning"},{"key":"9433_CR28","unstructured":"Denis DJ (2003) An alternative to null-hypothesis significance tests. Theory Sci 4(1)"},{"key":"9433_CR29","first-page":"3313","volume":"11","author":"JP Dmochowski","year":"2010","unstructured":"Dmochowski JP, Sajda P, Parra LC (2010) Maximum likelihood in cost-sensitive learning: model specification, approximations, and upper bounds. J Mach Learn Res 11:3313\u20133332","journal-title":"J Mach Learn Res"},{"key":"9433_CR30","doi-asserted-by":"crossref","unstructured":"Drummond C (2006) Machine learning as an experimental science (revisited). In: Proceedings of the 1st workshop on evaluation methods for machine learning","DOI":"10.1145\/1553374.1553546"},{"key":"9433_CR31","doi-asserted-by":"crossref","unstructured":"Drummond C (2008) Finding a balance between anarchy and orthodoxy. In: Proceedings of the 3rd workshop on evaluation methods for machine learning","DOI":"10.1145\/1553374.1553546"},{"issue":"1","key":"9433_CR32","doi-asserted-by":"crossref","first-page":"95","DOI":"10.1007\/s10994-006-8199-5","volume":"65","author":"C Drummond","year":"2006","unstructured":"Drummond C, Holte RC (2006) Cost curves: an improved methyod for visualizing classifier performance. Mach Learn 65(1):95\u2013130","journal-title":"Mach Learn"},{"issue":"1","key":"9433_CR33","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1080\/09528130903010295","volume":"22","author":"C Drummond","year":"2010","unstructured":"Drummond C, Japkowicz N (2010) Warning: Statistical benchmarking is addictive. Kicking the habit in machine learning. J Exp Theor Artif Intell 22(1):67\u201380","journal-title":"J Exp Theor Artif Intell"},{"issue":"1","key":"9433_CR34","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1214\/aos\/1176344552","volume":"7","author":"B Efron","year":"1979","unstructured":"Efron B (1979) Bootstrap methods: another look at the jackknife. Ann Stat 7(1):1\u201326","journal-title":"Ann Stat"},{"key":"9433_CR35","doi-asserted-by":"crossref","unstructured":"Efron B (1982) The jackknife, the bootstrap and other resampling plans. Soc Ind Appl Math","DOI":"10.1137\/1.9781611970319"},{"issue":"382","key":"9433_CR36","doi-asserted-by":"crossref","first-page":"316","DOI":"10.1080\/01621459.1983.10477973","volume":"78","author":"B Efron","year":"1983","unstructured":"Efron B (1983) Estimating the error rate of a prediction rule: improvement on cross-validation. J Am Stat Assoc 78(382):316\u2013331","journal-title":"J Am Stat Assoc"},{"issue":"1","key":"9433_CR37","first-page":"54","volume":"1","author":"B Efron","year":"1986","unstructured":"Efron B, Tibshirani R (1986) Bootstrap methods for standard errors, confidence intervals, and other measures of statistical accuracy. Statistics 1(1):54\u201377","journal-title":"Statistics"},{"key":"9433_CR38","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4899-4541-9","volume-title":"An Introduction to the Bootstrap","author":"B Efron","year":"1993","unstructured":"Efron B, Tibshirani R (1993) An Introduction to the Bootstrap. Chapman & Hall, London"},{"issue":"438","key":"9433_CR39","first-page":"548","volume":"92","author":"B Efron","year":"1997","unstructured":"Efron B, Tibshirani R (1997) Improvements on cross-validation: the 632+ bootstrap method. J Am Stat Assoc 92(438):548\u2013560","journal-title":"J Am Stat Assoc"},{"key":"9433_CR40","doi-asserted-by":"crossref","first-page":"103","DOI":"10.1016\/S1386-5056(97)00062-2","volume":"46","author":"M Egmont-Petersen","year":"1997","unstructured":"Egmont-Petersen M, Talmon JL, Hasman A (1997) Robustness metrics for measuring the influence of additive noise on the performance of statistical classifiers. Int J Med Inform 46:103\u2013112","journal-title":"Int J Med Inform"},{"key":"9433_CR41","unstructured":"Elazmeh W, Japkowicz N, Matwin S (2006) A framework for measuring classification difference with imbalance. In: Proceedings of the 1st workshop on evaluation methods for machine learning"},{"key":"9433_CR42","unstructured":"Elkan C (2001) The foundations of cost-sensitive learning. In: Proceedings of the 4th international joint conference on artificial intelligence, vol 17, pp 973\u2013978"},{"issue":"8","key":"9433_CR43","doi-asserted-by":"crossref","first-page":"861","DOI":"10.1016\/j.patrec.2005.10.010","volume":"27","author":"T Fawcett","year":"2006","unstructured":"Fawcett T (2006) An introduction to ROC analysis. Pattern Recognit Lett 27(8):861\u2013874","journal-title":"Pattern Recognit Lett"},{"key":"9433_CR44","doi-asserted-by":"crossref","first-page":"27","DOI":"10.1016\/j.patrec.2008.08.010","volume":"30","author":"C Ferri","year":"2009","unstructured":"Ferri C, Hern\u00e1ndez-Orallo R, Modroiu R (2009) An experimental comparison of performance measures for classification. Pattern Recognit Lett 30:27\u201338","journal-title":"Pattern Recognit Lett"},{"key":"9433_CR45","doi-asserted-by":"crossref","first-page":"920","DOI":"10.1080\/01621459.1993.10476358","volume":"88","author":"H Finner","year":"1993","unstructured":"Finner H (1993) On a monotonicity problem in step-down multiple test procedures. J Am Stat Assoc 88:920\u2013923","journal-title":"J Am Stat Assoc"},{"key":"9433_CR46","volume-title":"Statistical methods and scientific inference","author":"RA Fisher","year":"1937","unstructured":"Fisher RA (1937) Statistical methods and scientific inference. Hafner publishing Co, New York"},{"key":"9433_CR47","doi-asserted-by":"crossref","first-page":"55","DOI":"10.1023\/A:1009778005914","volume":"1","author":"JH Friedman","year":"1997","unstructured":"Friedman JH (1997) On bias, variance, 0\/1 loss, and the curse-of-dimensionality. Data Min Knowl Discov 1:55\u201377","journal-title":"Data Min Knowl Discov"},{"key":"9433_CR48","doi-asserted-by":"crossref","first-page":"86","DOI":"10.1214\/aoms\/1177731944","volume":"11","author":"M Friedman","year":"1940","unstructured":"Friedman M (1940) A comparison of alternative tests of significance for the problem of m rankings. Ann Math Stat 11:86\u201392","journal-title":"Ann Math Stat"},{"issue":"2","key":"9433_CR49","doi-asserted-by":"crossref","first-page":"137","DOI":"10.1007\/s11222-009-9153-8","volume":"21","author":"T Fushiki","year":"2011","unstructured":"Fushiki T (2011) Estimation of prediction error by using k-fold cross-validation. Stat Comput 21(2):137\u2013146","journal-title":"Stat Comput"},{"key":"9433_CR50","doi-asserted-by":"crossref","first-page":"1761","DOI":"10.1016\/j.patcog.2011.01.017","volume":"44","author":"M Galar","year":"2011","unstructured":"Galar M, Fern\u00e1ndez A, Barrenechea E, Bustince H, Herrera F (2011) An overview of ensemble methods for binary classifiers in multi-class problems: experimental study on one-vs-one and one-vs-all schemes. Pattern Recognit 44:1761\u20131776","journal-title":"Pattern Recognit"},{"key":"9433_CR51","doi-asserted-by":"crossref","DOI":"10.1201\/EBK1439826119","volume-title":"Knowledge Discovery from Data Streams","author":"J Gama","year":"2010","unstructured":"Gama J (2010) Knowledge Discovery from Data Streams. Chapman and Hall\/CRC, London"},{"key":"9433_CR52","doi-asserted-by":"crossref","unstructured":"Gama J, Sebastiao R, Pereira Rodrigues P (2009) Issues in evaluation of stream learning algorithms. In: Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining, pp 329\u2013338","DOI":"10.1145\/1557019.1557060"},{"key":"9433_CR53","first-page":"2677","volume":"9","author":"S Garcia","year":"2008","unstructured":"Garcia S, Herrera F (2008) An extension on statistical comparisons of classifiers over multiple data sets for all pairwise comparisons. J Mach Learn Res 9:2677\u20132694","journal-title":"J Mach Learn Res"},{"issue":"10","key":"9433_CR54","doi-asserted-by":"crossref","first-page":"2044","DOI":"10.1016\/j.ins.2009.12.010","volume":"180","author":"S Garcia","year":"2010","unstructured":"Garcia S, Fernandez A, Luengo J, Herrera F (2010a) Advanced nonparametric tests for multiple comparisons in the design of experiments in computational intelligence and data mining: experimental analysis of power. Inf Sci 180(10):2044\u20132064","journal-title":"Inf Sci"},{"key":"9433_CR55","doi-asserted-by":"crossref","unstructured":"Garcia V, Mollineda RA, Sanchez JS (2010b) Theoretical analysis of a performance measure for imbalanced data. In: Proceedings of the 18th IEEE international conference on pattern recognition, pp 617\u2013620","DOI":"10.1109\/ICPR.2010.156"},{"issue":"5","key":"9433_CR56","doi-asserted-by":"crossref","first-page":"791","DOI":"10.3758\/BF03196706","volume":"11","author":"S Glover","year":"2004","unstructured":"Glover S, Dixon P (2004) Likelihood ratios: A simple and flexible statistic for empirical psychologists. Psychon Bull Rev 11(5):791\u2013806","journal-title":"Psychon Bull Rev"},{"key":"9433_CR57","doi-asserted-by":"crossref","unstructured":"Golland P, Fischl B (2003) Permutation tests for classification: towards statistical significance in image-based studies. In: Proceedings of the 18th international conference on information processing in medical imaging, vol 18, pp 330\u2013341","DOI":"10.1007\/978-3-540-45087-0_28"},{"key":"9433_CR58","doi-asserted-by":"crossref","unstructured":"Golland P, Liang F, Makherjee S, Panchenko D (2005) Permutation tests for classification. In: Proceedings of the 18th annual conference on learning Theory, vol 18, pp 501\u2013515","DOI":"10.1007\/11503415_34"},{"key":"9433_CR59","doi-asserted-by":"crossref","first-page":"123","DOI":"10.1093\/bjps\/19.2.123","volume":"19","author":"IJ Good","year":"1968","unstructured":"Good IJ (1968) Corroboration, explanation, evolving probability, simplicity, and a sharpened razor. Br J Philos Sci 19:123\u2013143","journal-title":"Br J Philos Sci"},{"key":"9433_CR60","doi-asserted-by":"crossref","unstructured":"Good PI (2000) Permutation test: a practical guide to resampling methods for testing hypotheses. Springer","DOI":"10.1007\/978-1-4757-3235-1"},{"issue":"3","key":"9433_CR61","doi-asserted-by":"crossref","first-page":"135","DOI":"10.1053\/j.seminhematol.2008.04.003","volume":"45","author":"S Goodman","year":"2008","unstructured":"Goodman S (2008) A dirty dozen: twelve p-value misconceptions. Semin Hematol 45(3):135\u2013140","journal-title":"Semin Hematol"},{"key":"9433_CR62","unstructured":"Grandvalet Y, Bengio Y (2006) Hypothesis testing for cross-validation. Tech. rep., D\u00e9partement d\u2019informatique et recherche op\u00e9rationnelle, Universit\u00e9 de Montr\u00e9al"},{"issue":"1","key":"9433_CR63","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1145\/1656274.1656278","volume":"11","author":"M Hall","year":"2009","unstructured":"Hall M, Frank E, Holmes G, Pfahringer B, Reutemann P, Witten IH (2009) The WEKA data mining software: an update. SIGKDD Explor 11(1):10\u201318","journal-title":"SIGKDD Explor"},{"issue":"1","key":"9433_CR64","first-page":"1","volume":"7","author":"H Haller","year":"2002","unstructured":"Haller H, Krauss S (2002) Misinterpretations of significance: A problem students share with their teachers. Methods Psychol Res Online 7(1):1\u201320","journal-title":"Methods Psychol Res Online"},{"issue":"4","key":"9433_CR65","doi-asserted-by":"crossref","first-page":"736","DOI":"10.1175\/1520-0434(1997)012<0736:RDFMPF>2.0.CO;2","volume":"12","author":"TM Hamill","year":"1996","unstructured":"Hamill TM (1996) Reliability diagrams for multicategory probabilistic forecast. Weather Forecast 12(4):736\u2013741","journal-title":"Weather Forecast"},{"issue":"5","key":"9433_CR66","doi-asserted-by":"crossref","first-page":"335","DOI":"10.1016\/0167-8655(86)90054-1","volume":"4","author":"DJ Hand","year":"1986","unstructured":"Hand DJ (1986) Recent advances in error rate estimation. Pattern Recognit Lett 4(5):335\u2013346","journal-title":"Pattern Recognit Lett"},{"issue":"3","key":"9433_CR67","doi-asserted-by":"crossref","first-page":"317","DOI":"10.2307\/2983526","volume":"157","author":"DJ Hand","year":"1994","unstructured":"Hand DJ (1994) Deconstructing statistical questions. J R Stat Soc Ser A 157(3):317\u2013356","journal-title":"J R Stat Soc Ser A"},{"key":"9433_CR68","doi-asserted-by":"crossref","first-page":"103","DOI":"10.1007\/s10994-009-5119-5","volume":"77","author":"DJ Hand","year":"2009","unstructured":"Hand DJ (2009) Measuring classifier performance: a coherent alternative to the area under de ROC curve. Mach Learn 77:103\u2013123","journal-title":"Mach Learn"},{"key":"9433_CR69","doi-asserted-by":"crossref","first-page":"1502","DOI":"10.1002\/sim.3859","volume":"29","author":"DJ Hand","year":"2010","unstructured":"Hand DJ (2010) Evaluation diagnostic tests: the area under the ROC curve and the balance of errors. Stat Med 29:1502\u20131510","journal-title":"Stat Med"},{"issue":"5","key":"9433_CR70","doi-asserted-by":"crossref","first-page":"492","DOI":"10.1016\/j.patrec.2012.12.004","volume":"34","author":"DJ Hand","year":"2013","unstructured":"Hand DJ, Anagnostopoulos C (2013) When is the area under the receiver operating characteristic curve an appropriate measure of classifier performance? Pattern Recognit Lett 34(5):492\u2013495","journal-title":"Pattern Recognit Lett"},{"key":"9433_CR71","doi-asserted-by":"crossref","first-page":"41","DOI":"10.1016\/j.patrec.2013.12.011","volume":"40","author":"DJ Hand","year":"2014","unstructured":"Hand DJ, Anagnostopoulos C (2014) A better Beta for the H measure of classification performance. Pattern Recogn Lett 40:41\u201346","journal-title":"Pattern Recogn Lett"},{"key":"9433_CR72","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1023\/A:1010920819831","volume":"45","author":"DJ Hand","year":"2001","unstructured":"Hand DJ, Till RJ (2001) A simple generalisation of the area under the ROC curve for multiple class classification problems. Mach Learn 45:171\u2013186","journal-title":"Mach Learn"},{"key":"9433_CR73","doi-asserted-by":"crossref","DOI":"10.1007\/978-0-387-21606-5","volume-title":"The elements of statistical learning","author":"T Hastie","year":"2001","unstructured":"Hastie T, Tibshirani R, Friedman J (2001) The elements of statistical learning. Springer, Berlin"},{"key":"9433_CR74","doi-asserted-by":"crossref","first-page":"417","DOI":"10.2307\/2531823","volume":"43","author":"BS Holland","year":"1987","unstructured":"Holland BS, Copenhaver MD (1987) An improved sequentially rejective bonferroni test procedure. Biometrics 43:417\u2013423","journal-title":"Biometrics"},{"issue":"1","key":"9433_CR75","doi-asserted-by":"crossref","first-page":"11","DOI":"10.1023\/A:1023985022691","volume":"52","author":"T Hsing","year":"2003","unstructured":"Hsing T, Attoor S, Dougherty E (2003) Relation between permutation-test p values and classifier error estimates. Mach Learn 52(1):11\u201330","journal-title":"Mach Learn"},{"key":"9433_CR76","doi-asserted-by":"crossref","first-page":"571","DOI":"10.1080\/03610928008827904","volume":"9","author":"RL Iman","year":"1980","unstructured":"Iman RL, Davenport JM (1980) Approximations of the critical region of the friedman statistic. Commun Stat 9:571\u2013595","journal-title":"Commun Stat"},{"issue":"14","key":"9433_CR77","doi-asserted-by":"crossref","first-page":"1960","DOI":"10.1016\/j.patrec.2008.06.018","volume":"29","author":"A Isaksson","year":"2008","unstructured":"Isaksson A, Wallman M, Goransson H, Gustafsson M (2008) Cross-validation and bootstrapping are unreliable in small sample classification. Pattern Recognit Lett 29(14):1960\u20131965","journal-title":"Pattern Recognit Lett"},{"key":"9433_CR78","doi-asserted-by":"crossref","first-page":"87","DOI":"10.1007\/s00357-008-9003-y","volume":"25","author":"A Jamain","year":"2008","unstructured":"Jamain A, Hand DJ (2008) Mining supervised classification performance studies: a meta-analytic investigation. J Classif 25:87\u2013112","journal-title":"J Classif"},{"key":"9433_CR79","unstructured":"Japkowicz N (2006) Why question machine learning evaluation methods (an illustrative review of the shortcomings of current methods). In: Proceedings of the 1st workshop on evaluation methods for machine learning"},{"key":"9433_CR80","unstructured":"Japkowicz N (2008) Classifier evaluation: a need for better education and restructuring. In: Proceedings of the 3rd workshop on evaluation methods for machine learning"},{"key":"9433_CR81","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511921803","volume-title":"Evaluating learning algorithms","author":"N Japkowicz","year":"2011","unstructured":"Japkowicz N, Shah M (2011) Evaluating learning algorithms. Cambridge University Press, Cambridge, A classification perspective"},{"key":"9433_CR82","doi-asserted-by":"crossref","first-page":"175","DOI":"10.1007\/978-94-010-1436-6_6","volume":"2","author":"ET Jaynes","year":"1976","unstructured":"Jaynes ET (1976) Confidence intervals vs. bayesian intervals. Found Probab Theory Stat Inference Stat Theor Sci 2:175\u2013257","journal-title":"Found Probab Theory Stat Inference Stat Theor Sci"},{"issue":"3","key":"9433_CR83","doi-asserted-by":"crossref","first-page":"763","DOI":"10.2307\/3802789","volume":"63","author":"DH Johnson","year":"1999","unstructured":"Johnson DH (1999) The insignificance of statistical significance testing. J Wildl Manag 63(3):763\u2013772","journal-title":"J Wildl Manag"},{"issue":"11","key":"9433_CR84","doi-asserted-by":"crossref","first-page":"2259","DOI":"10.1109\/TPAMI.2012.21","volume":"34","author":"A Joshi","year":"2012","unstructured":"Joshi A, Porikli F, Papanikolopoulos NP (2012) Scalable active learning for multiclass image classification. IEEE Trans Pattern Anal Mach Intell 34(11):2259\u20132273","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9433_CR85","doi-asserted-by":"crossref","unstructured":"Joshi MV, Agarwal RC, Kumar V (2001) Mining needle in a haystack: classifying rare classes via two-phase rule induction. In: Proceedings of the 27th ACM SIGMOD international conference on management of data, pp 91\u2013102","DOI":"10.1145\/375663.375673"},{"key":"9433_CR86","unstructured":"Kohavi R (1995) A study of cross-validation and bootstrap for accuracy estimation and model selection. In: Proceedings of the 14th international joint conference on artificial intelligence, pp 1137\u20131143"},{"key":"9433_CR87","unstructured":"Kohavi R, Wolpert DH (1996) Bias plus variance decomposition for zero-one loss functions. In: Saitta L (ed) Proceedings of the 13th international conference on machine learning, Morgan Kaumann, pp 275\u2013283"},{"issue":"260","key":"9433_CR88","doi-asserted-by":"crossref","first-page":"583","DOI":"10.1080\/01621459.1952.10483441","volume":"47","author":"W Kruskal","year":"1952","unstructured":"Kruskal W, Wallis WA (1952) Use of ranks in one-criterion variance analysis. J Am Stat Assoc 47(260):583\u2013621","journal-title":"J Am Stat Assoc"},{"issue":"2","key":"9433_CR89","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1023\/A:1007452223027","volume":"30","author":"M Kubat","year":"1998","unstructured":"Kubat M, Holte RC, Matwin S (1998) Machine learning for the detection of oil spills in satellite radar images. Mach Learn 30(2):195\u2013215","journal-title":"Mach Learn"},{"key":"9433_CR90","unstructured":"Kuhn M (2015) Caret: classification and regression training. http:\/\/CRAN.R-project.org\/package=caret , R package version 6.0-41"},{"key":"9433_CR91","unstructured":"Lacoste A, Laviolette F, Marchand M (2012) Bayesian comparison of machine learning algorithms on single and multiple datasets. In: Proceedings of the 15th international conference on artificial intellegence and statistics, pp 665\u2013675"},{"key":"9433_CR92","doi-asserted-by":"crossref","first-page":"45","DOI":"10.1037\/h0072400","volume":"22","author":"SC Larson","year":"1931","unstructured":"Larson SC (1931) The shrinkage of the coefficient of multiple correlation. J Educ Psychol 22:45\u201355","journal-title":"J Educ Psychol"},{"key":"9433_CR93","unstructured":"Lavesson N (2006) Evaluation of supervised learning algorithms and classifiers. Master\u2019s thesis, Blekinge Institute of Technology"},{"key":"9433_CR94","unstructured":"Ling CX, Li C (1998) Data mining for direct marketing: Problems and solutions. In: Proceedings of the 4th international conference on knowledge discovery and data minig, pp 73\u201379"},{"issue":"3","key":"9433_CR95","doi-asserted-by":"crossref","first-page":"679","DOI":"10.3758\/s13428-010-0049-5","volume":"43","author":"M Masson","year":"2011","unstructured":"Masson M (2011) A tutorial on a practical bayesian alternative to null-hypothesis significance testing. Behav Res Methods 43(3):679\u201390","journal-title":"Behav Res Methods"},{"issue":"18","key":"9433_CR96","doi-asserted-by":"crossref","first-page":"2127","DOI":"10.1002\/(SICI)1097-0258(19970930)16:18<2127::AID-SIM633>3.0.CO;2-W","volume":"16","author":"WL May","year":"1997","unstructured":"May WL, Johnson WD (1997) Confidence intervals for differences in correlated binary proportions. Stat Med 16(18):2127\u20132136","journal-title":"Stat Med"},{"key":"9433_CR97","doi-asserted-by":"crossref","DOI":"10.1002\/0471725293","volume-title":"Discriminant analysis and statistical pattern recognition","author":"G McLachlan","year":"1992","unstructured":"McLachlan G (1992) Discriminant analysis and statistical pattern recognition. Wiley, New York"},{"issue":"1","key":"9433_CR98","doi-asserted-by":"crossref","first-page":"521","DOI":"10.1016\/j.patcog.2011.06.019","volume":"45","author":"JG Moreno-Torres","year":"2012","unstructured":"Moreno-Torres JG, Reader T, Al\u00e1iz-Rodri\u00edguez R, Chawla NV, Herrera F (2012a) A unifying view on dataset shift in classification. Pattern Recognit 45(1):521\u2013530","journal-title":"Pattern Recognit"},{"issue":"8","key":"9433_CR99","doi-asserted-by":"crossref","first-page":"1304","DOI":"10.1109\/TNNLS.2012.2199516","volume":"23","author":"JG Moreno-Torres","year":"2012","unstructured":"Moreno-Torres JG, S\u00e1ez JA, Herrera F (2012b) Study on the impact of partition-induced dataset shift on k-fold cross-validation. IEEE Trans Neural Netw Learn Syst 23(8):1304\u20131312","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"issue":"3","key":"9433_CR100","doi-asserted-by":"crossref","first-page":"239","DOI":"10.1023\/A:1024068626366","volume":"52","author":"C Nadeau","year":"2003","unstructured":"Nadeau C, Bengio Y (2003) Inference for the generalization error. Mach Learn 52(3):239\u2013281","journal-title":"Mach Learn"},{"key":"9433_CR101","unstructured":"Nakhaeizadeh G, Schnabl A (1998) Towards the personalization of algorihtms evaluation in data mining. In. In Proceedings of the 3rd international conference on knowledge discovery and data mining, pp 289\u2013293"},{"key":"9433_CR102","first-page":"1833","volume":"11","author":"M Ojala","year":"2010","unstructured":"Ojala M, Garriga GC (2010) Permutation tests for studying classifier performance. J Mach Learn Res 11:1833\u20131863","journal-title":"J Mach Learn Res"},{"issue":"1","key":"9433_CR103","doi-asserted-by":"crossref","first-page":"88","DOI":"10.1016\/j.jcss.2013.03.009","volume":"80","author":"J Otero","year":"2014","unstructured":"Otero J, S\u00e1nchez L, Couso I, Palacios A (2014) Bootstrap analysis of multiple repetitions of experiments using an interval-valued multiple comparison procedure. J Comput Syst Sci 80(1):88\u2013100","journal-title":"J Comput Syst Sci"},{"issue":"11","key":"9433_CR104","doi-asserted-by":"crossref","first-page":"1601","DOI":"10.1109\/TKDE.2011.59","volume":"23","author":"RC Prati","year":"2011","unstructured":"Prati RC, Batista GEPA, Monard MC (2011) A survey on graphical methods for classification predictive performance evaluation. IEEE Trans Knowl Data Eng 23(11):1601\u20131618","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"9433_CR105","unstructured":"Provost F, Fawcett T, Kohavi R (1998) The case against accuracy estimation for comparing induction algorithms. In: Proceeding of the 15th international conference on machine learning, pp 445\u2013453"},{"issue":"3","key":"9433_CR106","doi-asserted-by":"crossref","first-page":"205","DOI":"10.1145\/65943.65945","volume":"7","author":"V Raghavan","year":"1989","unstructured":"Raghavan V, Bollmann P, Jung GS (1989) A critical investigation of recall and precision as measures of retrieval system performance. ACM Trans Inf Syst 7(3):205\u2013229","journal-title":"ACM Trans Inf Syst"},{"key":"9433_CR107","doi-asserted-by":"crossref","unstructured":"Ranawana R, Palade V (2006) Optimized precision\u2013a new measure for classifier performance evaluation. In: Proceedings of the 23th IEEE international conference on evolutionary computation, pp 2254\u20132261","DOI":"10.1109\/CEC.2006.1688586"},{"key":"9433_CR108","doi-asserted-by":"crossref","unstructured":"Reader T, Hoens TR, Chawla NV (2010) Consequences of variability in classifier performance estimates. In: Proceedings of the 10th IEEE international conference on data mining, pp 421\u2013430","DOI":"10.1109\/ICDM.2010.110"},{"key":"9433_CR109","first-page":"101","volume":"5","author":"R Rifkin","year":"2004","unstructured":"Rifkin R, Klautau A (2004) In defense of one-vs-all classification. J Mach Learn Res 5:101\u2013141","journal-title":"J Mach Learn Res"},{"issue":"3","key":"9433_CR110","doi-asserted-by":"crossref","first-page":"569","DOI":"10.1109\/TPAMI.2009.187","volume":"32","author":"JD Rodr\u00edguez","year":"2010","unstructured":"Rodr\u00edguez JD, P\u00e9rez A, Lozano JA (2010) Sensitivity analysis of k-fold cross validation in prediction error estimation. IEEE Trans Pattern Anal Mach Intell 32(3):569\u2013575","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"3","key":"9433_CR111","doi-asserted-by":"crossref","first-page":"855","DOI":"10.1016\/j.patcog.2012.09.007","volume":"46","author":"JD Rodr\u00edguez","year":"2013","unstructured":"Rodr\u00edguez JD, P\u00e9rez A, Lozano JA (2013) A general framework for the statistical analysis of the sources of variance for classification error estimators. Pattern Recognit 46(3):855\u2013864","journal-title":"Pattern Recognit"},{"key":"9433_CR112","doi-asserted-by":"crossref","first-page":"663","DOI":"10.1093\/biomet\/77.3.663","volume":"77","author":"DM Rom","year":"1990","unstructured":"Rom DM (1990) A sequentially rejective test procedure based on a modified bonferroni inequality. Biometrika 77:663\u2013665","journal-title":"Biometrika"},{"issue":"5","key":"9433_CR113","doi-asserted-by":"crossref","first-page":"416","DOI":"10.1037\/h0042040","volume":"57","author":"W Rozeboom","year":"1960","unstructured":"Rozeboom W (1960) The fallacy of the null-hypothesis significance test. Psychol Bull 57(5):416\u2013428","journal-title":"Psychol Bull"},{"issue":"2","key":"9433_CR114","doi-asserted-by":"crossref","first-page":"350","DOI":"10.1016\/j.patcog.2010.07.025","volume":"44","author":"CM Schubert","year":"2011","unstructured":"Schubert CM, Thorsen SN, Oxley ME (2011) The ROC manifold for classification systems. Pattern Recognit 44(2):350\u2013362","journal-title":"Pattern Recognit"},{"key":"9433_CR115","doi-asserted-by":"crossref","first-page":"551","DOI":"10.1146\/annurev.ps.46.020195.003021","volume":"46","author":"JP Shaffer","year":"1995","unstructured":"Shaffer JP (1995) Multiple hypothesis testing. Annu Rev Psychol 46:551\u2013584","journal-title":"Annu Rev Psychol"},{"issue":"1\u20132","key":"9433_CR116","doi-asserted-by":"crossref","first-page":"31","DOI":"10.1007\/s10618-010-0175-9","volume":"22","author":"CN Silla","year":"2011","unstructured":"Silla CN, Freitas AA (2011) A survey of hierarchical classification across different application domains. Data Min Knowl Discov 22(1\u20132):31\u201372","journal-title":"Data Min Knowl Discov"},{"key":"9433_CR117","doi-asserted-by":"crossref","first-page":"272","DOI":"10.1111\/j.1469-1809.1946.tb02368.x","volume":"13","author":"C Smith","year":"1947","unstructured":"Smith C (1947) Some examples of discrimination. Ann Eugen 13:272\u2013282","journal-title":"Ann Eugen"},{"key":"9433_CR118","doi-asserted-by":"crossref","unstructured":"Sokolova M, Japkowicz N, Szpakowicz S (2006) Beyond accuracy, f-score and ROC: a family of discriminant measures for performance evaluation. In: Proceedings of the 19th Australian joint conference on artificial intelligence: advances in artificial intelligence, pp 1015\u20131021","DOI":"10.1007\/11941439_114"},{"key":"9433_CR119","doi-asserted-by":"crossref","first-page":"111","DOI":"10.1111\/j.2517-6161.1974.tb00994.x","volume":"36","author":"M Stone","year":"1974","unstructured":"Stone M (1974) Cross-validatory choice and assessment of statistical predictions (with discussion). J R Stat Soc Ser B 36:111\u2013147","journal-title":"J R Stat Soc Ser B"},{"issue":"1","key":"9433_CR120","doi-asserted-by":"crossref","first-page":"29","DOI":"10.1093\/biomet\/64.1.29","volume":"64","author":"M Stone","year":"1977","unstructured":"Stone M (1977) Asymptotics for and against cross-validation. Biometrika 64(1):29\u201335","journal-title":"Biometrika"},{"issue":"04","key":"9433_CR121","doi-asserted-by":"crossref","first-page":"687","DOI":"10.1142\/S0218001409007326","volume":"23","author":"Y Sun","year":"2009","unstructured":"Sun Y, Wong AK, Kamel MS (2009) Classification of imbalanced data: a review. Int J Pattern Recognit Artif Intell 23(04):687","journal-title":"Int J Pattern Recognit Artif Intell"},{"key":"9433_CR122","unstructured":"Tan P, Steinbach M, Kumar V (2006) Introduction to data mining. Addison Wesley, Reading"},{"issue":"3","key":"9433_CR123","doi-asserted-by":"crossref","first-page":"1","DOI":"10.4018\/jdwm.2007070101","volume":"3","author":"G Tsoumakas","year":"2007","unstructured":"Tsoumakas G, Katakis I (2007) Multi-label classification: an overview. Int J Data Wareh Min 3(3):1\u201313","journal-title":"Int J Data Wareh Min"},{"key":"9433_CR124","volume-title":"Information retrieval","author":"CJ Rijsbergen van","year":"1979","unstructured":"van Rijsbergen CJ (1979) Information retrieval. Butterworth-Heinemann, Oxford"},{"key":"9433_CR125","doi-asserted-by":"crossref","DOI":"10.1002\/0470854774","volume-title":"Statistical pattern recognition","author":"AR Webb","year":"2002","unstructured":"Webb AR (2002) Statistical pattern recognition, vol 9, 2nd edn. Wiley, New York","edition":"2"},{"issue":"2","key":"9433_CR126","doi-asserted-by":"crossref","first-page":"159","DOI":"10.1023\/A:1007659514849","volume":"40","author":"G Webb","year":"2000","unstructured":"Webb G (2000) Multiboosting: a technique for combining boosting and wagging. Mach Learn 40(2):159\u2013196","journal-title":"Mach Learn"},{"key":"9433_CR127","unstructured":"Webb GI, Conilione P (2003) Estimating bias and variance from data. Tech. rep"},{"issue":"1","key":"9433_CR128","doi-asserted-by":"crossref","first-page":"7","DOI":"10.1145\/1007730.1007734","volume":"6","author":"GM Weiss","year":"2004","unstructured":"Weiss GM (2004) Mining with rarity: a unifying framework. ACM SIGKDD Explor Newslett 6(1):7\u201319","journal-title":"ACM SIGKDD Explor Newslett"},{"issue":"6","key":"9433_CR129","doi-asserted-by":"crossref","first-page":"80","DOI":"10.2307\/3001968","volume":"1","author":"F Wilcoxon","year":"1945","unstructured":"Wilcoxon F (1945) Individual comparison by ranking methods. Biometrics 1(6):80\u201383","journal-title":"Biometrics"},{"issue":"7","key":"9433_CR130","doi-asserted-by":"crossref","first-page":"1341","DOI":"10.1162\/neco.1996.8.7.1341","volume":"8","author":"DH Wolpert","year":"1996","unstructured":"Wolpert DH (1996) The lack of a priori distinctions between learning algorithms. Neural Comput 8(7):1341\u20131390","journal-title":"Neural Comput"},{"issue":"1","key":"9433_CR131","doi-asserted-by":"crossref","first-page":"116","DOI":"10.1111\/j.1467-9469.2011.00754.x","volume":"39","author":"H Yanagihara","year":"2012","unstructured":"Yanagihara H (2012) Iterative bias correction of the cross validation criterion. Scand J Stat 39(1):116\u2013130","journal-title":"Scand J Stat"},{"key":"9433_CR132","volume-title":"Biostatistical analysis","author":"JH Zar","year":"2010","unstructured":"Zar JH (2010) Biostatistical analysis, 5th edn. Pearson Prentice Hall, Englewood Cliffs","edition":"5"}],"container-title":["Artificial Intelligence Review"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-015-9433-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10462-015-9433-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-015-9433-y","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,28]],"date-time":"2025-05-28T22:44:53Z","timestamp":1748472293000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10462-015-9433-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,6,30]]},"references-count":132,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2015,12]]}},"alternative-id":["9433"],"URL":"https:\/\/doi.org\/10.1007\/s10462-015-9433-y","relation":{},"ISSN":["0269-2821","1573-7462"],"issn-type":[{"value":"0269-2821","type":"print"},{"value":"1573-7462","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015,6,30]]}}}