{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T23:13:15Z","timestamp":1780614795064,"version":"3.54.1"},"reference-count":115,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Medical Image 
Analysis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1016\/j.media.2025.103556","type":"journal-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T12:10:13Z","timestamp":1744200613000},"page":"103556","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":2,"special_numbering":"C","title":["Evaluating medical AI systems in dermatology under uncertain ground truth"],"prefix":"10.1016","volume":"103","author":[{"given":"David","family":"Stutz","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4463-8455","authenticated-orcid":false,"given":"Ali Taylan","family":"Cemgil","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Abhijit Guha","family":"Roy","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tatiana","family":"Matejovicova","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3404-8849","authenticated-orcid":false,"given":"Melih","family":"Barsbey","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Patricia","family":"Strachan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1735-9680","authenticated-orcid":false,"given":"Mike","family":"Schaekermann","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2386-8243","authenticated-orcid":false,"given":"Jan","family":"Freyberg","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rajeev","family":"Rikhye","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"h
ttps:\/\/orcid.org\/0009-0009-9489-2161","authenticated-orcid":false,"given":"Beverly","family":"Freeman","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4510-3074","authenticated-orcid":false,"given":"Javier Perez","family":"Matos","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0217-1885","authenticated-orcid":false,"given":"Umesh","family":"Telang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3023-8824","authenticated-orcid":false,"given":"Dale R.","family":"Webster","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7916-9436","authenticated-orcid":false,"given":"Yuan","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Greg 
S.","family":"Corrado","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3960-6002","authenticated-orcid":false,"given":"Yossi","family":"Matias","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7466-7997","authenticated-orcid":false,"given":"Pushmeet","family":"Kohli","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4079-8275","authenticated-orcid":false,"given":"Yun","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Arnaud","family":"Doucet","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Alan","family":"Karthikesalingam","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.media.2025.103556_b1","series-title":"Consistency is key: Disentangling label variation in natural language processing with intra-annotator agreement","author":"Abercrombie","year":"2023"},{"issue":"4","key":"10.1016\/j.media.2025.103556_b2","doi-asserted-by":"crossref","first-page":"343","DOI":"10.1023\/A:1022873112823","article-title":"Learning from noisy examples","volume":"2","author":"Angluin","year":"1987","journal-title":"Mach. Learn."},{"issue":"1","key":"10.1016\/j.media.2025.103556_b3","first-page":"31","article-title":"The three sides of CrowdTruth","volume":"1","author":"Aroyo","year":"2014","journal-title":"J. Hum. 
Comput."},{"issue":"1","key":"10.1016\/j.media.2025.103556_b4","first-page":"15","article-title":"Truth is a Lie: Crowd truth and the seven myths of human annotation","volume":"36","author":"Aroyo","year":"2015","journal-title":"AI Mag."},{"key":"10.1016\/j.media.2025.103556_b5","series-title":"Robust and efficient medical imaging with self-supervision","author":"Azizi","year":"2022"},{"key":"10.1016\/j.media.2025.103556_b6","series-title":"Stop measuring calibration when humans disagree","author":"Baan","year":"2022"},{"key":"10.1016\/j.media.2025.103556_b7","unstructured":"Bachrach, Y., Graepel, T., Minka, T., Guiver, J., 2012. How To Grade a Test Without Knowing the Answers - A Bayesian Graphical Model for Adaptive Crowdsourcing and Aptitude Testing. In: Proc. of the International Conference on Machine Learning. ICML."},{"key":"10.1016\/j.media.2025.103556_b8","article-title":"Fine-tuning language models to find agreement among humans with diverse preferences","author":"Bakker","year":"2022"},{"key":"10.1016\/j.media.2025.103556_b9","doi-asserted-by":"crossref","unstructured":"Basile, V., Fell, M., Fornaciari, T., Hovy, D., Paun, S., Plank, B., Poesio, M., Uma, A., 2021. We Need to Consider Disagreement in Evaluation. In: Proceedings of the 1st Workshop on Benchmarking: Past, Present and Future.","DOI":"10.18653\/v1\/2021.bppf-1.3"},{"key":"10.1016\/j.media.2025.103556_b10","series-title":"Proc. of the Annual Meeting of the Association for Computational Linguistics","article-title":"Learning with annotation noise","author":"Beigman","year":"2009"},{"key":"10.1016\/j.media.2025.103556_b11","series-title":"Missing information, unresponsive authors, experimental flaws: The impossibility of assessing the reproducibility of previous human evaluations in NLP","author":"Belz","year":"2023"},{"key":"10.1016\/j.media.2025.103556_b12","doi-asserted-by":"crossref","unstructured":"Bhattacharya, N., Li, Q., Gurari, D., 2019. 
Why Does a Visual Question Have Different Answers?. In: Proc. of the IEEE International Conference on Computer Vision. ICCV.","DOI":"10.1109\/ICCV.2019.00437"},{"key":"10.1016\/j.media.2025.103556_b13","doi-asserted-by":"crossref","unstructured":"Braylan, A., Alonso, O., Lease, M., 2022. Measuring Annotator Agreement Generally across Complex Structured, Multi-Object, and Free-Text Annotation Tasks. In: Proc. of the International World Wide Web Conference. WWW.","DOI":"10.1145\/3485447.3512242"},{"issue":"11","key":"10.1016\/j.media.2025.103556_b14","doi-asserted-by":"crossref","first-page":"4014","DOI":"10.3390\/app10114014","article-title":"The elephant in the machine: Proposing a new metric of data reliability and its application to a medical case to assess classification reliability","volume":"10","author":"Cabitza","year":"2020","journal-title":"Appl. Sci."},{"key":"10.1016\/j.media.2025.103556_b15","doi-asserted-by":"crossref","first-page":"174","DOI":"10.1080\/10618600.2012.638220","article-title":"Efficient Bayesian inference for generalized Bradley\u2013Terry models","volume":"21","author":"Caron","year":"2010","journal-title":"J. Comput. Graph. Statist."},{"key":"10.1016\/j.media.2025.103556_b16","unstructured":"Carvalho, A., Larson, K., 2013. A Consensual Linear Opinion Pool. In: Proc. of the International Joint Conference on Artificial Intelligence. IJCAI."},{"key":"10.1016\/j.media.2025.103556_b17","doi-asserted-by":"crossref","unstructured":"Chen, Q., Bragg, J., Chilton, L.B., Weld, D.S., 2019a. Cicero: Multi-Turn, Contextual Argumentation for Accurate Crowdsourcing. In: Proc. of the Conference on Human Factors in Computing Systems.","DOI":"10.1145\/3290605.3300761"},{"key":"10.1016\/j.media.2025.103556_b18","unstructured":"Chen, P., Liao, B., Chen, G., Zhang, S., 2019b. Understanding and Utilizing Deep Neural Networks Trained with Noisy Labels. In: Proc. of the International Conference on Machine Learning. 
ICML."},{"key":"10.1016\/j.media.2025.103556_b19","doi-asserted-by":"crossref","unstructured":"Chu, Z., Ma, J., Wang, H., 2021. Learning from Crowds by Modeling Common Confusions. In: Proc. of the Conference on Artificial Intelligence. AAAI.","DOI":"10.1609\/aaai.v35i7.16730"},{"issue":"1","key":"10.1016\/j.media.2025.103556_b20","doi-asserted-by":"crossref","first-page":"37","DOI":"10.1177\/001316446002000104","article-title":"A coefficient of agreement for nominal scales","volume":"20","author":"Cohen","year":"1960","journal-title":"Educ. Psychol. Meas."},{"key":"10.1016\/j.media.2025.103556_b21","doi-asserted-by":"crossref","unstructured":"Collins, K.M., Bhatt, U., Weller, A., 2022. Eliciting and Learning with Soft Labels from Every Annotator. In: Proc. of the AAAI Conference on Human Computation and Crowdsourcing. HCOMP.","DOI":"10.1609\/hcomp.v10i1.21986"},{"key":"10.1016\/j.media.2025.103556_b22","article-title":"Learning from partial labels","volume":"12","author":"Cour","year":"2011","journal-title":"J. Mach. Learn. Res. (JMLR)"},{"key":"10.1016\/j.media.2025.103556_b23","doi-asserted-by":"crossref","first-page":"92","DOI":"10.1162\/tacl_a_00449","article-title":"Dealing with disagreements: Looking beyond the majority vote in subjective annotations","volume":"10","author":"Davani","year":"2022","journal-title":"Trans. Assoc. Comput. Linguistics (TACL)"},{"issue":"1","key":"10.1016\/j.media.2025.103556_b24","first-page":"20","article-title":"Maximum likelihood estimation of observer error-rates using the EM algorithm","volume":"28","author":"Dawid","year":"1979","journal-title":"J. R. Stat. Soc. (JRSS)"},{"key":"10.1016\/j.media.2025.103556_b25","doi-asserted-by":"crossref","DOI":"10.1162\/COLI_a_00097","article-title":"Did it happen? The pragmatic complexity of veridicality assessment","author":"de Marneffe","year":"2012","journal-title":"Comput. 
Linguist."},{"issue":"1","key":"10.1016\/j.media.2025.103556_b26","doi-asserted-by":"crossref","first-page":"550","DOI":"10.14778\/1687627.1687690","article-title":"Integrating conflicting data: The role of source dependence","volume":"2","author":"Dong","year":"2009","journal-title":"Proc. the VLDB Endow."},{"key":"10.1016\/j.media.2025.103556_b27","article-title":"Microtalk: Using argumentation to improve crowdsourcing accuracy","volume":"vol. 4","author":"Drapeau","year":"2016"},{"key":"10.1016\/j.media.2025.103556_b28","doi-asserted-by":"crossref","DOI":"10.1259\/bjr.20210435","article-title":"Improving reference standards for validation of AI-based radiography","volume":"94","author":"Duggan","year":"2021","journal-title":"Br. J. Radiol."},{"key":"10.1016\/j.media.2025.103556_b29","series-title":"Proc. of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","article-title":"A crowdsourced frame disambiguation corpus with ambiguity","author":"Dumitrache","year":"2019"},{"key":"10.1016\/j.media.2025.103556_b30","article-title":"Measuring clinician\u2013machine agreement in differential diagnoses for dermatology","volume":"182","author":"Eng","year":"2019","journal-title":"Br. J. Dermatol."},{"key":"10.1016\/j.media.2025.103556_b31","doi-asserted-by":"crossref","unstructured":"Fagin, R., Kumar, R., Mahdian, M., Sivakumar, D., Vee, E., 2004. Comparing and Aggregating Rankings with Ties. In: Proc. of the ACM SIGACT-SIGMOD-SIGART Symposium on Principles of Database Systems. PODS.","DOI":"10.1145\/1055558.1055568"},{"issue":"1","key":"10.1016\/j.media.2025.103556_b32","doi-asserted-by":"crossref","first-page":"134","DOI":"10.1137\/S0895480102412856","article-title":"Comparing top k lists","volume":"17","author":"Fagin","year":"2003","journal-title":"SIAM J. Discret. Math. 
(SIDMA)"},{"key":"10.1016\/j.media.2025.103556_b33","doi-asserted-by":"crossref","first-page":"543","DOI":"10.1016\/0895-4356(90)90158-L","article-title":"High agreement but low kappa: I. The problems of two paradoxes","volume":"43 6","author":"Feinstein","year":"1990","journal-title":"J. Clin. Epidemiol."},{"key":"10.1016\/j.media.2025.103556_b34","doi-asserted-by":"crossref","unstructured":"Field, A., Blodgett, S.L., Waseem, Z., Tsvetkov, Y., 2021. A Survey of Race, Racism, and Anti-Racism in NLP. In: Proc. of the Annual Meeting of the Association for Computational Linguistics. ACL.","DOI":"10.18653\/v1\/2021.acl-long.149"},{"key":"10.1016\/j.media.2025.103556_b35","series-title":"Statistical Methods for Rates and Proportions","author":"Fleiss","year":"2003"},{"key":"10.1016\/j.media.2025.103556_b36","series-title":"Bayesian nonparametric plackett-luce models for the analysis of clustered ranked data","author":"Francois","year":"2012"},{"key":"10.1016\/j.media.2025.103556_b37","doi-asserted-by":"crossref","unstructured":"Freeman, B., Hammel, N., Phene, S., Huang, A., Ackermann, R., Kanzheleva, O., Hutson, M., Taggart, C., Duong, Q., Sayres, R., 2021. Iterative Quality Control Strategies for Expert Medical Image Labeling. In: Proc. of the AAAI Conference on Human Computation and Crowdsourcing. HCOMP, pp. 60\u201371.","DOI":"10.1609\/hcomp.v9i1.18940"},{"issue":"6","key":"10.1016\/j.media.2025.103556_b38","doi-asserted-by":"crossref","first-page":"2825","DOI":"10.1109\/TIP.2017.2689998","article-title":"Deep label distribution learning with label ambiguity","volume":"26","author":"Gao","year":"2017","journal-title":"IEEE Trans. Image Process. (TIP)"},{"key":"10.1016\/j.media.2025.103556_b39","unstructured":"Gaunt, A., Borsa, D., Bachrach, Y., 2016. Training deep neural nets to aggregate crowdsourced responses. In: Proc. of the Conference on Uncertainty in Artificial Intelligence, Vol. 242251. 
UAI."},{"key":"10.1016\/j.media.2025.103556_b40","doi-asserted-by":"crossref","unstructured":"Gordon, M.L., Lam, M.S., Park, J.S., Patel, K., Hancock, J.T., Hashimoto, T., Bernstein, M.S., 2022. Jury Learning: Integrating Dissenting Voices into Machine Learning Models. In: Proc. of the Conference on Human Factors in Computing Systems.","DOI":"10.1145\/3491102.3502004"},{"key":"10.1016\/j.media.2025.103556_b41","doi-asserted-by":"crossref","unstructured":"Gordon, M.L., Zhou, K., Patel, K., Hashimoto, T., Bernstein, M.S., 2021. The Disagreement Deconvolution: Bringing Machine Learning Performance Metrics In Line With Reality. In: Proc. of the Conference on Human Factors in Computing Systems.","DOI":"10.1145\/3411764.3445423"},{"key":"10.1016\/j.media.2025.103556_b42","doi-asserted-by":"crossref","unstructured":"Guan, M.Y., Gulshan, V., Dai, A.M., Hinton, G.E., 2018. Who Said What: Modeling Individual Labelers Improves Classification. In: Proc. of the Conference on Artificial Intelligence. AAAI.","DOI":"10.1609\/aaai.v32i1.11756"},{"key":"10.1016\/j.media.2025.103556_b43","article-title":"Using trusted data to train deep networks on labels corrupted by severe noise","author":"Hendrycks","year":"2018"},{"key":"10.1016\/j.media.2025.103556_b44","doi-asserted-by":"crossref","unstructured":"H\u00fcllermeier, E., Beringer, J., 2005. Learning from Ambiguously Labeled Examples. In: Proc. of the International Symposium on Intelligent Data Analysis. IDA.","DOI":"10.1007\/11552253_16"},{"key":"10.1016\/j.media.2025.103556_b45","first-page":"384","article-title":"MM algorithms for generalized Bradley-Terry models","volume":"32","author":"Hunter","year":"2003","journal-title":"Ann. 
Statist."},{"key":"10.1016\/j.media.2025.103556_b46","article-title":"Development and assessment of an artificial intelligence-based tool for skin condition diagnosis by primary care physicians and nurse practitioners in teledermatology practices","volume":"4 4","author":"Jain","year":"2021","journal-title":"J. Am. Med. Assoc. (JAMA)"},{"issue":"6","key":"10.1016\/j.media.2025.103556_b47","doi-asserted-by":"crossref","first-page":"983","DOI":"10.1145\/293347.293351","article-title":"Efficient noise-tolerant learning from statistical queries","volume":"45","author":"Kearns","year":"1998","journal-title":"J. ACM"},{"issue":"4","key":"10.1016\/j.media.2025.103556_b48","doi-asserted-by":"crossref","first-page":"807","DOI":"10.1137\/0222052","article-title":"Learning in the presence of malicious errors","volume":"22","author":"Kearns","year":"1993","journal-title":"SIAM J. Comput."},{"issue":"3","key":"10.1016\/j.media.2025.103556_b49","doi-asserted-by":"crossref","first-page":"177","DOI":"10.1561\/0600000071","article-title":"Crowdsourcing in computer vision","volume":"10","author":"Kovashka","year":"2016","journal-title":"Found. Trends Comput. Graph. Vis."},{"key":"10.1016\/j.media.2025.103556_b50","series-title":"Learning Multiple Layers of Features from Tiny Images","author":"Krizhevsky","year":"2009"},{"key":"10.1016\/j.media.2025.103556_b51","doi-asserted-by":"crossref","unstructured":"Kumar, R., Vassilvitskii, S., 2010. Generalized distances between rankings. In: Proc. of the International World Wide Web Conference. WWW.","DOI":"10.1145\/1772690.1772749"},{"key":"10.1016\/j.media.2025.103556_b52","doi-asserted-by":"crossref","first-page":"159","DOI":"10.2307\/2529310","article-title":"The measurement of observer agreement for categorical data","author":"Landis","year":"1977","journal-title":"Biometrics"},{"key":"10.1016\/j.media.2025.103556_b53","unstructured":"Lawrence, N.D., Sch\u00f6lkopf, B., 2001. 
Estimating a Kernel Fisher Discriminant in the Presence of Label Noise. In: Proc. of the International Conference on Machine Learning. ICML."},{"key":"10.1016\/j.media.2025.103556_b54","series-title":"SemEval-2023 task 11: Learning with disagreements (LeWiDi)","author":"Leonardelli","year":"2023"},{"issue":"2","key":"10.1016\/j.media.2025.103556_b55","doi-asserted-by":"crossref","first-page":"97","DOI":"10.14778\/2535568.2448943","article-title":"Truth finding on the deep web: Is the problem solved?","volume":"6","author":"Li","year":"2012","journal-title":"Proc. the VLDB Endow."},{"key":"10.1016\/j.media.2025.103556_b56","doi-asserted-by":"crossref","first-page":"900","DOI":"10.1038\/s41591-020-0842-3","article-title":"A deep learning system for differential diagnosis of skin diseases","volume":"26","author":"Liu","year":"2020","journal-title":"Nature Med."},{"key":"10.1016\/j.media.2025.103556_b57","unstructured":"Lovchinsky, I., Daks, A., Malkin, I., Samangouei, P., Saeedi, A., Liu, Y., Sankaranarayanan, S., Gafner, T., Sternlieb, B., Maher, P., Silberman, N., 2020. Discrepancy Ratio: Evaluating Model Performance When Even Experts Disagree on the Truth. In: Proc. of the International Conference on Learning Representations. ICLR."},{"key":"10.1016\/j.media.2025.103556_b58","series-title":"Individual Choice Behavior: A Theoretical Analysis","author":"Luce","year":"2012"},{"key":"10.1016\/j.media.2025.103556_b59","doi-asserted-by":"crossref","DOI":"10.1038\/s41467-018-07619-7","article-title":"Why rankings of biomedical image analysis competitions should be interpreted with care","volume":"9","author":"Maier-Hein","year":"2018","journal-title":"Nat. Commun."},{"key":"10.1016\/j.media.2025.103556_b60","doi-asserted-by":"crossref","first-page":"276","DOI":"10.11613\/BM.2012.031","article-title":"Interrater reliability: the kappa statistic","volume":"22","author":"McHugh","year":"2012","journal-title":"Biochem. 
Medica"},{"key":"10.1016\/j.media.2025.103556_b61","doi-asserted-by":"crossref","DOI":"10.1016\/j.ogla.2023.01.007","article-title":"The definition of glaucomatous optic neuropathy in artificial intelligence research and clinical applications","author":"Medeiros","year":"2023","journal-title":"Ophthalmol. Glaucoma"},{"key":"10.1016\/j.media.2025.103556_b62","doi-asserted-by":"crossref","unstructured":"Nguyen, N., Caruana, R., 2008. Classification with partial labels. In: Proc. of the ACM International Conference on Knowledge Discovery & Data Mining.","DOI":"10.1145\/1401890.1401958"},{"key":"10.1016\/j.media.2025.103556_b63","doi-asserted-by":"crossref","unstructured":"Nie, Y., Zhou, X., Bansal, M., 2020. What Can We Learn from Collective Human Opinions on Natural Language Inference Data?. In: Proc. of the Conference on Empirical Methods in Natural Language Processing.","DOI":"10.18653\/v1\/2020.emnlp-main.734"},{"key":"10.1016\/j.media.2025.103556_b64","article-title":"Pervasive label errors in test sets destabilize machine learning benchmarks","author":"Northcutt","year":"2021"},{"key":"10.1016\/j.media.2025.103556_b65","series-title":"Confident learning: Estimating uncertainty in dataset labels","author":"Northcutt","year":"2019"},{"key":"10.1016\/j.media.2025.103556_b66","series-title":"Robustness to label noise depends on the shape of the noise distribution in feature space","author":"Oyen","year":"2022"},{"key":"10.1016\/j.media.2025.103556_b67","doi-asserted-by":"crossref","first-page":"677","DOI":"10.1162\/tacl_a_00293","article-title":"Inherent disagreements in human textual inferences","volume":"7","author":"Pavlick","year":"2019","journal-title":"Trans. Assoc. Comput. Linguistics (TACL)"},{"key":"10.1016\/j.media.2025.103556_b68","doi-asserted-by":"crossref","unstructured":"Peterson, J.C., Battleday, R.M., Griffiths, T.L., Russakovsky, O., 2019. Human Uncertainty Makes Classification More Robust. In: Proc. 
of the IEEE International Conference on Computer Vision. ICCV.","DOI":"10.1109\/ICCV.2019.00971"},{"issue":"12","key":"10.1016\/j.media.2025.103556_b69","doi-asserted-by":"crossref","first-page":"2381","DOI":"10.1109\/TPAMI.2017.2647944","article-title":"Dynamic programming for instance annotation in multi-instance multi-label learning","volume":"39","author":"Pham","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.media.2025.103556_b70","doi-asserted-by":"crossref","DOI":"10.1016\/j.ophtha.2019.07.024","article-title":"Deep learning and glaucoma specialists: The relative importance of optic disc features to predict glaucoma referral in fundus photographs","author":"Phene","year":"2019","journal-title":"Ophthalmology"},{"key":"10.1016\/j.media.2025.103556_b71","first-page":"193","article-title":"The analysis of permutations","volume":"24","author":"Plackett","year":"1975","journal-title":"J. R. Stat. Soc. Ser. C. Appl. Stat."},{"key":"10.1016\/j.media.2025.103556_b72","series-title":"The \u2019problem\u2019 of human label variation: On ground truth in data, modeling and evaluation","author":"Plank","year":"2022"},{"key":"10.1016\/j.media.2025.103556_b73","unstructured":"Powers, D.M.W., 2012. The Problem with Kappa. In: Proc. of the Conference of the European Chapter of the Association for Computational Linguistics. EACL, pp. 345\u2013355."},{"key":"10.1016\/j.media.2025.103556_b74","doi-asserted-by":"crossref","DOI":"10.3389\/frai.2022.828187","article-title":"In search of ambiguity: A three-stage workflow design to clarify annotation guidelines for crowd workers","volume":"5","author":"Pradhan","year":"2022","journal-title":"Front. Artif. Intell."},{"key":"10.1016\/j.media.2025.103556_b75","unstructured":"Raghu, M., Blumer, K., Sayres, R., Obermeyer, Z., Kleinberg, B., Mullainathan, S., Kleinberg, J., 2019. Direct uncertainty prediction for medical second opinions. In: Proc. 
of the International Conference on Machine Learning. ICML."},{"key":"10.1016\/j.media.2025.103556_b76","doi-asserted-by":"crossref","unstructured":"Reidsma, D., op den Akker, R., 2008. Exploiting \u2019subjective\u2019 Annotations. In: Proc. of the Annual Meeting of the Association for Computational Linguistics. ACL.","DOI":"10.3115\/1611628.1611631"},{"key":"10.1016\/j.media.2025.103556_b77","doi-asserted-by":"crossref","unstructured":"Rodrigues, F., Pereira, F.C., 2018. Deep Learning from Crowds. In: Proc. of the Conference on Artificial Intelligence. AAAI.","DOI":"10.1609\/aaai.v32i1.11506"},{"key":"10.1016\/j.media.2025.103556_b78","doi-asserted-by":"crossref","unstructured":"R\u00f6ttger, P., Vidgen, B., Hovy, D., Pierrehumbert, J.B., 2022. Two Contrasting Data Annotation Paradigms for Subjective NLP Tasks. In: Proc. of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. NAACL-HLT.","DOI":"10.18653\/v1\/2022.naacl-main.13"},{"key":"10.1016\/j.media.2025.103556_b79","doi-asserted-by":"crossref","unstructured":"R\u00f6ttger, P., Vidgen, B., Nguyen, D., Waseem, Z., Margetts, H.Z., Pierrehumbert, J.B., 2021. HateCheck: Functional Tests for Hate Speech Detection Models. In: Proc. of the Annual Meeting of the Association for Computational Linguistics. ACL.","DOI":"10.18653\/v1\/2021.acl-long.4"},{"key":"10.1016\/j.media.2025.103556_b80","article-title":"Does your dermatology classifier know what it doesn\u2019t know? Detecting the long-tail of unseen conditions","volume":"75","author":"Roy","year":"2022","journal-title":"Med. Image Anal."},{"issue":"3","key":"10.1016\/j.media.2025.103556_b81","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"ImageNet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2015","journal-title":"Int. J. Comput. Vis. 
(IJCV)"},{"key":"10.1016\/j.media.2025.103556_b82","series-title":"PROMISE Winter School","article-title":"Metrics, statistics, tests","author":"Sakai","year":"2013"},{"key":"10.1016\/j.media.2025.103556_b83","doi-asserted-by":"crossref","unstructured":"Sandri, M., Leonardelli, E., Tonelli, S., Jezek, E., 2023. Why Don\u2019t You Do It Right? Analysing Annotators\u2019 Disagreement in Subjective Tasks. In: Proc. of the Conference of the European Chapter of the Association for Computational Linguistics. EACL.","DOI":"10.18653\/v1\/2023.eacl-main.178"},{"key":"10.1016\/j.media.2025.103556_b84","series-title":"Human-AI interaction in the presence of ambiguity: From deliberation-based labeling to ambiguity-aware AI","author":"Schaekermann","year":"2020"},{"key":"10.1016\/j.media.2025.103556_b85","doi-asserted-by":"crossref","first-page":"76:1","DOI":"10.1145\/3359178","article-title":"Understanding expert disagreement in medical data analysis through structured adjudication","volume":"3","author":"Schaekermann","year":"2019","journal-title":"Proc. ACM Hum. Comput. Interact."},{"key":"10.1016\/j.media.2025.103556_b86","series-title":"Proc. of the Conference on Human Factors in Computing Systems","first-page":"1","article-title":"Ambiguity-aware AI assistants for medical data analysis","author":"Schaekermann","year":"2020"},{"key":"10.1016\/j.media.2025.103556_b87","doi-asserted-by":"crossref","unstructured":"Schaekermann, M., Cai, C.J., Huang, A.E., Sayres, R., 2020b. Expert Discussions Improve Comprehension of Difficult Cases in Medical Image Assessment. In: Proc. of the Conference on Human Factors in Computing Systems.","DOI":"10.1145\/3313831.3376290"},{"issue":"6","key":"10.1016\/j.media.2025.103556_b88","doi-asserted-by":"crossref","DOI":"10.1167\/tvst.8.6.40","article-title":"Remote tool-based adjudication for grading diabetic retinopathy","volume":"8","author":"Schaekermann","year":"2019","journal-title":"Transl. Vis. Sci. 
Technol."},{"key":"10.1016\/j.media.2025.103556_b89","article-title":"Resolvable vs. irresolvable ambiguity: A new hybrid framework for dealing with uncertain ground truth","volume":"vol. 2016","author":"Schaekermann","year":"2016"},{"key":"10.1016\/j.media.2025.103556_b90","doi-asserted-by":"crossref","unstructured":"Sculley, D., 2007. Rank Aggregation for Similar Items. In: Proc. of the SIAM International Conference on Data Mining. SDM.","DOI":"10.1137\/1.9781611972771.66"},{"key":"10.1016\/j.media.2025.103556_b91","doi-asserted-by":"crossref","unstructured":"Sheng, V.S., Provost, F.J., Ipeirotis, P.G., 2008. Get another label? Improving data quality and data mining using multiple, noisy labelers. In: Proc. of the ACM International Conference on Knowledge Discovery & Data Mining.","DOI":"10.1145\/1401890.1401965"},{"issue":"1","key":"10.1016\/j.media.2025.103556_b92","doi-asserted-by":"crossref","first-page":"17","DOI":"10.1016\/S0167-7152(98)00006-6","article-title":"A weighted Kendall\u2019s tau statistic","volume":"39","author":"Shieh","year":"1998","journal-title":"Statist. Probab. Lett."},{"key":"10.1016\/j.media.2025.103556_b93","doi-asserted-by":"crossref","DOI":"10.1016\/j.jesp.2021.104157","article-title":"Wise teamwork: Collective confidence calibration predicts the effectiveness of group discussion","volume":"96","author":"Silver","year":"2021","journal-title":"J. Exp. Soc. Psychol."},{"key":"10.1016\/j.media.2025.103556_b94","article-title":"Inferring ground truth from subjective labelling of venus images","author":"Smyth","year":"1994"},{"key":"10.1016\/j.media.2025.103556_b95","doi-asserted-by":"crossref","unstructured":"Snow, R., O\u2019Connor, B., Jurafsky, D., Ng, A.Y., 2008. Cheap and Fast - But is it Good? Evaluating Non-Expert Annotations for Natural Language Tasks. In: Proc. 
of the Conference on Empirical Methods in Natural Language Processing.","DOI":"10.3115\/1613715.1613751"},{"key":"10.1016\/j.media.2025.103556_b96","doi-asserted-by":"crossref","unstructured":"Sorokin, A., Forsyth, D.A., 2008. Utility data annotation with Amazon Mechanical Turk. In: Proc. of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) Workshops.","DOI":"10.1109\/CVPRW.2008.4562953"},{"key":"10.1016\/j.media.2025.103556_b97","doi-asserted-by":"crossref","unstructured":"Tanno, R., Saeedi, A., Sankaranarayanan, S., Alexander, D.C., Silberman, N., 2019. Learning From Noisy Labels by Regularized Estimation of Annotator Confusion. In: Proc. of the IEEE Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR.2019.01150"},{"issue":"10","key":"10.1016\/j.media.2025.103556_b98","doi-asserted-by":"crossref","first-page":"2480","DOI":"10.1109\/TPAMI.2018.2860987","article-title":"Max-margin majority voting for learning from crowds","volume":"41","author":"Tian","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.media.2025.103556_b99","article-title":"Scaling and disagreements: Bias, noise, and ambiguity","volume":"5","author":"Uma","year":"2022","journal-title":"Front. Artificial Intell."},{"key":"10.1016\/j.media.2025.103556_b100","doi-asserted-by":"crossref","first-page":"1385","DOI":"10.1613\/jair.1.12752","article-title":"Learning from disagreement: A survey","volume":"72","author":"Uma","year":"2021","journal-title":"J. Artificial Intell. Res."},{"key":"10.1016\/j.media.2025.103556_b101","doi-asserted-by":"crossref","unstructured":"Vigna, S., 2015. A Weighted Correlation Index for Rankings with Ties. In: Proc. of the International World Wide Web Conference. 
WWW.","DOI":"10.1145\/2736277.2741088"},{"key":"10.1016\/j.media.2025.103556_b102","series-title":"Ilab at SemEval-2023 task 11 Le-Wi-Di: Modelling disagreement or modelling perspectives?","author":"Vitsakis","year":"2023"},{"key":"10.1016\/j.media.2025.103556_b103","doi-asserted-by":"crossref","unstructured":"Wang, D., Kaplan, L.M., Le, H.K., Abdelzaher, T.F., 2012. On truth discovery in social sensing: a maximum likelihood estimation approach. In: Proc. of the International Conference on Information Processing in Sensor Networks. IPSN.","DOI":"10.1145\/2185677.2185737"},{"key":"10.1016\/j.media.2025.103556_b104","unstructured":"Wang, H., Xiao, R., Li, Y., Feng, L., Niu, G., Chen, G., Zhao, J., 2022. PiCO: Contrastive Label Disambiguation for Partial Label Learning. In: Proc. of the International Conference on Learning Representations. ICLR."},{"issue":"4","key":"10.1016\/j.media.2025.103556_b105","doi-asserted-by":"crossref","first-page":"385","DOI":"10.1038\/nmeth.2855","article-title":"Sleep-spindle detection: crowdsourcing and evaluating performance of experts, non-experts and automated methods","volume":"11","author":"Warby","year":"2014","journal-title":"Nature Methods"},{"issue":"4","key":"10.1016\/j.media.2025.103556_b106","doi-asserted-by":"crossref","first-page":"20:1","DOI":"10.1145\/1852102.1852106","article-title":"A similarity measure for indefinite rankings","volume":"28","author":"Webber","year":"2010","journal-title":"ACM Trans. Inf. Syst. (TOIS)"},{"key":"10.1016\/j.media.2025.103556_b107","article-title":"The multidimensional wisdom of crowds","author":"Welinder","year":"2010"},{"key":"10.1016\/j.media.2025.103556_b108","doi-asserted-by":"crossref","unstructured":"Welinder, P., Perona, P., 2010. Online crowdsourcing: Rating annotators and obtaining cost-effective labels. In: Proc. of the IEEE Conference on Computer Vision and Pattern Recognition. 
CVPR.","DOI":"10.1109\/CVPRW.2010.5543189"},{"key":"10.1016\/j.media.2025.103556_b109","doi-asserted-by":"crossref","unstructured":"Wu, S., Crestani, F., 2003. Methods for Ranking Information Retrieval Systems without Relevance Judgments. In: Proc. of the ACM Symposium on Applied Computing. SAC.","DOI":"10.1145\/952686.952693"},{"issue":"3","key":"10.1016\/j.media.2025.103556_b110","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1007\/s10994-013-5412-1","article-title":"Learning from multiple annotators with varying expertise","volume":"95","author":"Yan","year":"2014","journal-title":"Mach. Learn."},{"issue":"6","key":"10.1016\/j.media.2025.103556_b111","doi-asserted-by":"crossref","first-page":"796","DOI":"10.1109\/TKDE.2007.190745","article-title":"Truth discovery with multiple conflicting information providers on the web","volume":"20","author":"Yin","year":"2008","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"10.1016\/j.media.2025.103556_b112","series-title":"Disentangling human error from the ground truth in segmentation of medical images","author":"Zhang","year":"2020"},{"key":"10.1016\/j.media.2025.103556_b113","series-title":"Learning from label proportions by learning with label noise","author":"Zhang","year":"2022"},{"issue":"6","key":"10.1016\/j.media.2025.103556_b114","doi-asserted-by":"crossref","first-page":"550","DOI":"10.14778\/2168651.2168656","article-title":"A Bayesian approach to discovering truth from conflicting sources for data integration","volume":"5","author":"Zhao","year":"2012","journal-title":"Proc. the VLDB Endow."},{"issue":"5","key":"10.1016\/j.media.2025.103556_b115","doi-asserted-by":"crossref","first-page":"541","DOI":"10.14778\/3055540.3055547","article-title":"Truth inference in crowdsourcing: Is the problem solved?","volume":"10","author":"Zheng","year":"2017","journal-title":"Proc. 
the VLDB Endow."}],"container-title":["Medical Image Analysis"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1361841525001033?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1361841525001033?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T13:03:32Z","timestamp":1774530212000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1361841525001033"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":115,"alternative-id":["S1361841525001033"],"URL":"https:\/\/doi.org\/10.1016\/j.media.2025.103556","relation":{},"ISSN":["1361-8415"],"issn-type":[{"value":"1361-8415","type":"print"}],"subject":[],"published":{"date-parts":[[2025,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Evaluating medical AI systems in dermatology under uncertain ground truth","name":"articletitle","label":"Article Title"},{"value":"Medical Image Analysis","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.media.2025.103556","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"103556"}}