{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T07:46:25Z","timestamp":1758267985321,"version":"3.44.0"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032058690","type":"print"},{"value":"9783032058706","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T00:00:00Z","timestamp":1758240000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T00:00:00Z","timestamp":1758240000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-05870-6_19","type":"book-chapter","created":{"date-parts":[[2025,9,18]],"date-time":"2025-09-18T11:29:00Z","timestamp":1758194940000},"page":"187-197","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["meval: A Statistical Toolbox for\u00a0Fine-Grained Model Performance Analysis"],"prefix":"10.1007","author":[{"given":"Dishantkumar","family":"Sutariya","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0097-3868","authenticated-orcid":false,"given":"Eike","family":"Petersen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,19]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Austin, P.C., Steyerberg, E.W.: Bootstrap confidence intervals for loess-based calibration curves 33(15), 2699\u20132700 (2014)","DOI":"10.1002\/sim.6167"},{"key":"19_CR2","unstructured":"Bellamy, R.K.E., Dey, K., Hind, M., Hoffman, S.C., et al.: AI Fairness 360: an extensible toolkit for detecting, understanding, and mitigating unwanted algorithmic bias (2018), https:\/\/arxiv.org\/abs\/1810.01943"},{"key":"19_CR3","unstructured":"Boyd, K., Costa, V.S., Davis, J., Page, C.D.: Unachievable region in precision-recall space and its effect on empirical evaluation. In: Proceedings of the 29th International Conference on Machine Learning, pp. 1619\u20131626 (2012)"},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Br\u00f6cker, J.: Estimating reliability and resolution of probability forecasts through decomposition of the empirical score 39(3\u20134), 655\u2013667 (2011)","DOI":"10.1007\/s00382-011-1191-1"},{"key":"19_CR5","doi-asserted-by":"publisher","first-page":"102305","DOI":"10.1016\/j.media.2021.102305","volume":"75","author":"B Cassidy","year":"2022","unstructured":"Cassidy, B., et al.: Analysis of the ISIC image datasets: usage, benchmarks and recommendations. Med. Image Anal. 75, 102305 (2022)","journal-title":"Med. Image Anal."},{"issue":"149","key":"19_CR6","first-page":"1","volume":"25","author":"JJ Cherian","year":"2024","unstructured":"Cherian, J.J., Cand\u00e8s, E.J.: Statistical inference for fairness auditing. J. Mach. Learn. Res. 25(149), 1\u201349 (2024)","journal-title":"J. Mach. Learn. Res."},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Collins, G.S., et al.: Evaluation of clinical prediction models (part 1): from development to external validation. BMJ p. e074819 (2024)","DOI":"10.1136\/bmj-2023-074819"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"DiCiccio, C., Vasudevan, S., Basu, K., Kenthapadi, K., Agarwal, D.: Evaluating fairness using permutation tests, pp. 1467\u20131477, August 2020","DOI":"10.1145\/3394486.3403199"},{"issue":"6","key":"19_CR9","doi-asserted-by":"publisher","first-page":"2603","DOI":"10.1177\/0962280215602040","volume":"26","author":"D Feng","year":"2015","unstructured":"Feng, D., et al.: A comparison of confidence\/credible interval methods for the area under the ROC curve for continuous diagnostic tests with small sample size. Stat. Methods Med. Res. 26(6), 2603\u20132621 (2015). https:\/\/doi.org\/10.1177\/0962280215602040","journal-title":"Stat. Methods Med. Res."},{"key":"19_CR10","doi-asserted-by":"publisher","unstructured":"Ferro, C.A.T., Fricker, T.E.: A bias-corrected decomposition of the Brier score. Q. J. Royal Meteorol. Soc. 138(668), 1954\u20131960 (2012). https:\/\/doi.org\/10.1002\/qj.1924","DOI":"10.1002\/qj.1924"},{"key":"19_CR11","unstructured":"Flach, P., Kull, M.: Precision-recall-gain curves: PR analysis done right. In: Advances in Neural Information Processing Systems, vol.\u00a028 (2015)"},{"key":"19_CR12","unstructured":"Gildenblat, J.: A python library for confidence intervals. https:\/\/github.com\/jacobgil\/confidenceinterval (2023)"},{"issue":"23","key":"19_CR13","doi-asserted-by":"publisher","first-page":"e215","DOI":"10.1161\/01.CIR.101.23.e215","volume":"101","author":"AL Goldberger","year":"2000","unstructured":"Goldberger, A.L., Amaral, L.A., Glass, L., Hausdorff, J.M., Ivanov, P.C., et al.: PhysioBank, PhysioToolkit, and PhysioNet. Circulation 101(23), e215\u2013e220 (2000)","journal-title":"Circulation"},{"key":"19_CR14","unstructured":"Gruber, S.G., Buettner, F.: Better uncertainty calibration via proper scores for classification and beyond. In: Advances in Neural Information Processing Systems (2022)"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"Jim\u00e9nez-S\u00e1nchez, A., Juodelyte, D., Chamberlain, B., Cheplygina, V.: Detecting shortcuts in medical images \u2013 a case study in chest x-rays. In: International Symposium on Biomedical Imaging (ISBI), IEEE, April 2023","DOI":"10.1109\/ISBI53787.2023.10230572"},{"key":"19_CR16","doi-asserted-by":"publisher","unstructured":"Johnson, A., Lungren, M., Peng, Y., Lu, Z., et al.: MIMIC-CXR-JPG - chest radiographs with structured labels. PhysioNet (2024). https:\/\/doi.org\/10.13026\/JSN5-T979","DOI":"10.13026\/JSN5-T979"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Johnson, A., Pollard, T.J., et al.: MIMIC-CXR, a de-identified publicly available database of chest radiographs with free-text reports. Sci. Data 6(1) (2019)","DOI":"10.1038\/s41597-019-0322-0"},{"key":"19_CR18","unstructured":"Kearns, M., Neel, S., Roth, A., Wu, Z.S.: Preventing fairness gerrymandering: auditing and learning for subgroup fairness. In: Proceedings of the 35th International Conference on Machine Learning (2018)"},{"issue":"3","key":"19_CR19","doi-asserted-by":"publisher","first-page":"196","DOI":"10.1207\/s15327957pspr0203_4","volume":"2","author":"NL Kerr","year":"1998","unstructured":"Kerr, N.L.: HARKing: hypothesizing after the results are known. Pers. Soc. Psychol. Rev. 2(3), 196\u2013217 (1998). https:\/\/doi.org\/10.1207\/s15327957pspr0203_4","journal-title":"Pers. Soc. Psychol. Rev."},{"key":"19_CR20","unstructured":"Kumar, A., Liang, P.S., Ma, T.: Verified uncertainty calibration. In: Advances in Neural Information Processing Systems, vol.\u00a032. Curran Associates, Inc"},{"key":"19_CR21","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1016\/j.compbiolchem.2013.02.003","volume":"48","author":"W Li","year":"2014","unstructured":"Li, W., et al.: Using volcano plots and regularized-chi statistics in genetic association studies. Comput. Biol. Chem. 48, 77\u201383 (2014)","journal-title":"Comput. Biol. Chem."},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Lotter, W.: Acquisition parameters influence AI recognition of race in chest x-rays and mitigating these factors reduces underdiagnosis bias. Nat. Commun. 15(1) (2024)","DOI":"10.1038\/s41467-024-52003-3"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Mukherjee, P., et al.: Confounding factors need to be accounted for in assessing bias by machine learning algorithms 28(6), 1159\u20131160","DOI":"10.1038\/s41591-022-01847-7"},{"key":"19_CR24","doi-asserted-by":"publisher","unstructured":"Newcombe, R.G.: Confidence intervals for an effect size measure based on the Mann\u2013Whitney statistic. Part 2: asymptotic methods and evaluation. Stat. Med. 25(4), 559\u2013573 (2005). https:\/\/doi.org\/10.1002\/sim.2324","DOI":"10.1002\/sim.2324"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Oakden-Rayner, L., Dunnmon, J., Carneiro, G., Re, C.: Hidden stratification causes clinically meaningful failures in machine learning for medical imaging. In: Proceedings of the ACM Conference on Health, Inference, and Learning (2020)","DOI":"10.1145\/3368555.3384468"},{"key":"19_CR26","doi-asserted-by":"publisher","unstructured":"Olesen, V., et al.: Slicing through bias: explaining performance gaps in medical image analysis using slice discovery methods. In: MICCAI FAIMI Workshop (2024). https:\/\/doi.org\/10.1007\/978-3-031-72787-0_1","DOI":"10.1007\/978-3-031-72787-0_1"},{"key":"19_CR27","doi-asserted-by":"publisher","unstructured":"Petersen, E., et al.: On (assessing) the fairness of risk score models. In: Proceedings of the 2023 ACM Conference on Fairness, Accountability, and Transparency (2023). https:\/\/doi.org\/10.1145\/3593013.3594045","DOI":"10.1145\/3593013.3594045"},{"key":"19_CR28","doi-asserted-by":"publisher","unstructured":"Petersen, E., Holm, S., Ganz, M., Feragen, A.: The path toward equal performance in medical machine learning. Patterns 4(7) (2023). https:\/\/doi.org\/10.1016\/j.patter.2023.100790","DOI":"10.1016\/j.patter.2023.100790"},{"key":"19_CR29","unstructured":"Pfohl, S.R., et al.: Understanding challenges to the interpretation of disaggregated evaluations of algorithmic fairness (2025), https:\/\/arxiv.org\/abs\/2506.04193"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Rainio, O., Teuho, J., Kl\u00e9n, R.: Evaluation metrics and statistical tests for machine learning. Sci. Rep. 14(1) (2024)","DOI":"10.1038\/s41598-024-56706-x"},{"key":"19_CR31","doi-asserted-by":"publisher","unstructured":"Raschka, S.: Model evaluation, model selection, and algorithm selection in machine learning, November 2018. https:\/\/doi.org\/10.48550\/ARXIV.1811.12808","DOI":"10.48550\/ARXIV.1811.12808"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Ricci\u00a0Lara, M.A., Mosquera, C., Ferrante, E., Echeveste, R.: Towards unraveling calibration biases in medical image analysis. In: MICCAI FAIMI Workshop (2023)","DOI":"10.1007\/978-3-031-45249-9_13"},{"key":"19_CR33","unstructured":"Roelofs, R., Cain, N., Shlens, J., Mozer, M.C.: Mitigating bias in calibration error estimation. In: Proceedings of The 25th International Conference on Artificial Intelligence and Statistics, vol.\u00a0151, pp. 4036\u20134054 (2022)"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Rotemberg, V., et al.: A patient-centric dataset of images and metadata for identifying melanomas using clinical context. Sci. Data 8(1) (2021)","DOI":"10.1038\/s41597-021-00815-z"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Seabold, S., Perktold, J.: statsmodels: Econometric and statistical modeling with python. In: 9th Python in Science Conference (2010)","DOI":"10.25080\/Majora-92bf1922-011"},{"key":"19_CR36","unstructured":"Seyyed-Kalantari, L., Liu, G., McDermott, M., Chen, I.Y., Ghassemi, M.: CheXclusion: fairness gaps in deep chest X-ray classifiers. In: Biocomputing 2021"},{"issue":"12","key":"19_CR37","doi-asserted-by":"publisher","first-page":"2176","DOI":"10.1038\/s41591-021-01595-0","volume":"27","author":"L Seyyed-Kalantari","year":"2021","unstructured":"Seyyed-Kalantari, L., Zhang, H., McDermott, M.B.A., Chen, I.Y., Ghassemi, M.: Underdiagnosis bias of artificial intelligence algorithms applied to chest radiographs in under-served patient populations. Nat. Med. 27(12), 2176\u20132182 (2021)","journal-title":"Nat. Med."},{"key":"19_CR38","doi-asserted-by":"publisher","unstructured":"Stefan, A.M., Sch\u00f6nbrodt, F.D.: Big little lies: a compendium and simulation ofp-hacking strategies. Royal Soc. Open Sci. 10(2) (2023). https:\/\/doi.org\/10.1098\/rsos.220346","DOI":"10.1098\/rsos.220346"},{"issue":"11","key":"19_CR39","doi-asserted-by":"publisher","first-page":"1389","DOI":"10.1109\/LSP.2014.2337313","volume":"21","author":"X Sun","year":"2014","unstructured":"Sun, X., Xu, W.: Fast implementation of DeLong\u2019s algorithm for comparing the areas under correlated receiver operating characteristic curves. IEEE Signal Process. Lett. 21(11), 1389\u20131393 (2014)","journal-title":"IEEE Signal Process. Lett."},{"key":"19_CR40","doi-asserted-by":"publisher","unstructured":"Varoquaux, G., Colliot, O.: Evaluating machine learning models and their diagnostic value. In: Machine Learning for Brain Disorders. Springer US (2023). https:\/\/doi.org\/10.1007\/978-1-0716-3195-9_20","DOI":"10.1007\/978-1-0716-3195-9_20"},{"key":"19_CR41","unstructured":"Weerts, H., et al.: Fairlearn: assessing and improving fairness of AI systems (2023), http:\/\/jmlr.org\/papers\/v24\/23-0389.html"},{"key":"19_CR42","doi-asserted-by":"crossref","unstructured":"Weng, N., et al.: Are sex-based physiological differences the cause of gender bias for chest X-ray diagnosis? In: MICCAI 2023 FAIMI Workshop, pp. 142\u2013152 (2023)","DOI":"10.1007\/978-3-031-45249-9_14"},{"key":"19_CR43","doi-asserted-by":"publisher","unstructured":"Yang, Y., Zhang, H., Gichoya, J.W., Katabi, D., Ghassemi, M.: The limits of fair medical imaging AI in real-world generalization. Nat. Med. (2024). https:\/\/doi.org\/10.1038\/s41591-024-03113-4","DOI":"10.1038\/s41591-024-03113-4"},{"key":"19_CR44","unstructured":"Zhang, H., Dullerud, N., Roth, K., et al.: Improving the fairness of chest X-ray classifiers. In: Conference on Health, Inference, and Learning (CHIL) (2022)"},{"key":"19_CR45","unstructured":"Zhang, Z., Neill, D.B.: Identifying significant predictive bias in classifiers. In: NeurIPS FAT\/ML Workshop (2017)"}],"container-title":["Lecture Notes in Computer Science","Fairness of AI in Medical Imaging"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-05870-6_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,18]],"date-time":"2025-09-18T22:07:20Z","timestamp":1758233240000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-05870-6_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,19]]},"ISBN":["9783032058690","9783032058706"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-05870-6_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,19]]},"assertion":[{"value":"19 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"FAIMI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"MICCAI Workshop on Fairness of AI in Medical Imaging","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Daejeon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Korea (Republic of)","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"faimi2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/faimi-workshop.github.io\/2025-miccai-workshop\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}