{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T03:25:32Z","timestamp":1773890732083,"version":"3.50.1"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,4,23]],"date-time":"2025-04-23T00:00:00Z","timestamp":1745366400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,4,23]],"date-time":"2025-04-23T00:00:00Z","timestamp":1745366400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci Data"],"DOI":"10.1038\/s41597-025-04664-y","type":"journal-article","created":{"date-parts":[[2025,4,23]],"date-time":"2025-04-23T09:02:26Z","timestamp":1745398946000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["A benchmarking framework and dataset for learning to defer in human-AI decision-making"],"prefix":"10.1038","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6865-4577","authenticated-orcid":false,"given":"Jean V.","family":"Alves","sequence":"first","affiliation":[]},{"given":"Diogo","family":"Leit\u00e3o","sequence":"additional","affiliation":[]},{"given":"S\u00e9rgio","family":"Jesus","sequence":"additional","affiliation":[]},{"given":"Marco O. P.","family":"Sampaio","sequence":"additional","affiliation":[]},{"given":"Javier","family":"Li\u00e9bana","sequence":"additional","affiliation":[]},{"given":"Pedro","family":"Saleiro","sequence":"additional","affiliation":[]},{"given":"M\u00e1rio A. T.","family":"Figueiredo","sequence":"additional","affiliation":[]},{"given":"Pedro","family":"Bizarro","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,23]]},"reference":[{"key":"4664_CR1","doi-asserted-by":"publisher","unstructured":"De-Arteaga, M., Fogliato, R. & Chouldechova, A. A Case for Humans-in-the-Loop: Decisions in the Presence of Erroneous Algorithmic Scores. In Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems, 1\u201312 https:\/\/doi.org\/10.1145\/3313831.3376638 (ACM, Honolulu HI USA, 2020).","DOI":"10.1145\/3313831.3376638"},{"key":"4664_CR2","doi-asserted-by":"publisher","first-page":"637","DOI":"10.1007\/s12599-019-00595-2","volume":"61","author":"D Dellermann","year":"2019","unstructured":"Dellermann, D., Ebel, P., Soellner, M. & Leimeister, J. M. Hybrid. Intelligence. Business & Information Systems Engineering 61, 637\u2013643, https:\/\/doi.org\/10.1007\/s12599-019-00595-2 (2019).","journal-title":"Intelligence. Business & Information Systems Engineering"},{"key":"4664_CR3","doi-asserted-by":"publisher","unstructured":"Inkpen, K. et al. Advancing human-ai complementarity: The impact of user expertise and algorithmic tuning on joint decision making. ACM Trans. Comput.-Hum. Interact. 30, https:\/\/doi.org\/10.1145\/3534561 (2023).","DOI":"10.1145\/3534561"},{"key":"4664_CR4","doi-asserted-by":"publisher","unstructured":"Charusaie, M.-A., et al. (eds.) International Conference on Machine Learning, ICML 2022, 17-23 July 2022, Baltimore, Maryland, USA, vol. 162 of Proceedings of Machine Learning Research, 2972\u20133005 https:\/\/doi.org\/10.48550\/arXiv.2207.09584 (PMLR, 2022).","DOI":"10.48550\/arXiv.2207.09584"},{"key":"4664_CR5","doi-asserted-by":"publisher","unstructured":"Hemmer, P., Schellhammer, S., V\u00f6ssing, M., Jakubik, J. & Satzger, G. Forming Effective Human-AI Teams: Building Machine Learning Models that Complement the Capabilities of Multiple Experts. In Raedt, L. D. (ed.) Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, IJCAI 2022, Vienna, Austria, 23-29 July 2022, 2478\u20132484 https:\/\/doi.org\/10.24963\/ijcai.2022\/344 (ijcai.org, 2022).","DOI":"10.24963\/ijcai.2022\/344"},{"key":"4664_CR6","doi-asserted-by":"publisher","unstructured":"Raghu, M. et al. Direct uncertainty prediction for medical second opinions. In International Conference on Machine Learning, 5281\u20135290 https:\/\/doi.org\/10.48550\/arXiv.1807.01771 (PMLR, 2019).","DOI":"10.48550\/arXiv.1807.01771"},{"key":"4664_CR7","doi-asserted-by":"publisher","unstructured":"Raghu, M. et al. The Algorithmic Automation Problem: Prediction, Triage, and Human Effort. CoRR abs\/1903.12220 https:\/\/doi.org\/10.48550\/arXiv.1903.12220 (2019).","DOI":"10.48550\/arXiv.1903.12220"},{"key":"4664_CR8","doi-asserted-by":"publisher","unstructured":"Mozannar, H. & Sontag, D. A. Consistent Estimators for Learning to Defer to an Expert. In Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event, vol. 119 of Proceedings of Machine Learning Research, 7076\u20137087 https:\/\/doi.org\/10.48550\/arXiv.2006.01862 (PMLR, 2020).","DOI":"10.48550\/arXiv.2006.01862"},{"key":"4664_CR9","doi-asserted-by":"publisher","unstructured":"Mozannar, H. et al. Who should predict? exact algorithms for learning to defer to humans. In Proceedings of the 26th International Conference on Artificial Intelligence and Statistics (AISTATS), https:\/\/doi.org\/10.48550\/arXiv.2301.06197 (2023).","DOI":"10.48550\/arXiv.2301.06197"},{"key":"4664_CR10","doi-asserted-by":"publisher","unstructured":"Madras, D., Pitassi, T. & Zemel, R. Predict Responsibly: Improving Fairness and Accuracy by Learning to Defer. In Advances in Neural Information Processing Systems, vol. 31 https:\/\/doi.org\/10.48550\/arXiv.1711.06664 (Curran Associates, Inc., 2018).","DOI":"10.48550\/arXiv.1711.06664"},{"key":"4664_CR11","doi-asserted-by":"publisher","unstructured":"Verma, R. & Nalisnick, E. T. Calibrated Learning to Defer with One-vs-All Classifiers. In Chaudhuri, K. et al. (eds.) International Conference on Machine Learning, ICML 2022, 17-23 July 2022, Baltimore, Maryland, USA, vol. 162 of Proceedings of Machine Learning Research, 22184\u201322202 https:\/\/doi.org\/10.48550\/arXiv.2202.03673 (PMLR, 2022).","DOI":"10.48550\/arXiv.2202.03673"},{"key":"4664_CR12","doi-asserted-by":"publisher","unstructured":"Verma, R., Barrej\u00f3n, D. & Nalisnick, E. Learning to defer to multiple experts: Consistent surrogate losses, confidence calibration, and conformal ensembles. In International Conference on Artificial Intelligence and Statistics, 11415\u201311434 https:\/\/doi.org\/10.48550\/arXiv.2210.16955 (PMLR, 2023).","DOI":"10.48550\/arXiv.2210.16955"},{"key":"4664_CR13","doi-asserted-by":"publisher","unstructured":"Hemmer, P., Thede, L., V\u00f6ssing, M., Jakubik, J. & K\u00fchl, N. Learning to defer with limited expert predictions. In Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, 6002\u20136011 https:\/\/doi.org\/10.1609\/aaai.v37i5.25742 (2023).","DOI":"10.1609\/aaai.v37i5.25742"},{"key":"4664_CR14","doi-asserted-by":"publisher","unstructured":"Wang, X. et al. Chestx-ray8: Hospital-scale chest x-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2097\u20132106 https:\/\/doi.org\/10.1109\/CVPR.2017.369 (2017).","DOI":"10.1109\/CVPR.2017.369"},{"key":"4664_CR15","doi-asserted-by":"publisher","unstructured":"Peterson, J. C., Battleday, R. M., Griffiths, T. L. & Russakovsky, O. Human uncertainty makes classification more robust. In Proceedings of the IEEE\/CVF international conference on computer vision, 9617\u20139626 https:\/\/doi.org\/10.1109\/ICCV.2019.00971 (2019).","DOI":"10.1109\/ICCV.2019.00971"},{"key":"4664_CR16","unstructured":"Krizhevsky, A. Learning multiple layers of features from tiny images. University of Toronto (2012)."},{"key":"4664_CR17","doi-asserted-by":"publisher","unstructured":"Zhu, Z., Liu, T. & Liu, Y. A second-order approach to learning with instance-dependent label noise. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 10113\u201310123 https:\/\/doi.org\/10.48550\/arXiv.2012.11854 (2021).","DOI":"10.48550\/arXiv.2012.11854"},{"key":"4664_CR18","doi-asserted-by":"publisher","unstructured":"Berthon, A., Han, B., Niu, G., Liu, T. & Sugiyama, M. Confidence scores make instance-dependent label-noise learning possible. In International conference on machine learning, 825\u2013836 https:\/\/doi.org\/10.48550\/arXiv.2001.03772 (PMLR, 2021).","DOI":"10.48550\/arXiv.2001.03772"},{"key":"4664_CR19","unstructured":"Ke, G. et al. LightGBM: A Highly Efficient Gradient Boosting Decision Tree. In Guyon, I. et al. (eds.) Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, 3146\u20133154 (2017)."},{"key":"4664_CR20","doi-asserted-by":"publisher","unstructured":"Jesus, S. et al. Turning the Tables: Biased, Imbalanced, Dynamic Tabular Datasets for ML Evaluation. In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2022 https:\/\/doi.org\/10.48550\/arXiv.2211.13358 (2022).","DOI":"10.48550\/arXiv.2211.13358"},{"key":"4664_CR21","doi-asserted-by":"publisher","unstructured":"Amarasinghe, K. et al. On the importance of application-grounded experimental design for evaluating explainable ml methods. Proceedings of the AAAI Conference on Artificial Intelligence, 38(19), 20921\u201320929, https:\/\/doi.org\/10.1609\/aaai.v38i19.30082 (2024).","DOI":"10.1609\/aaai.v38i19.30082"},{"key":"4664_CR22","doi-asserted-by":"publisher","unstructured":"Levy, A., Agrawal, M., Satyanarayan, A. & Sontag, D. Assessing the impact of automated suggestions on decision making: Domain experts mediate model errors but take less initiative. In Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, 1\u201313 https:\/\/doi.org\/10.1145\/3411764.3445522 (2021).","DOI":"10.1145\/3411764.3445522"},{"key":"4664_CR23","doi-asserted-by":"publisher","first-page":"1023","DOI":"10.2307\/2290129","volume":"83","author":"TJ Mitchell","year":"1988","unstructured":"Mitchell, T. J. & Beauchamp, J. J. Bayesian variable selection in linear regression. Journal of the american statistical association 83, 1023\u20131032, https:\/\/doi.org\/10.2307\/2290129 (1988).","journal-title":"Journal of the american statistical association"},{"key":"4664_CR24","unstructured":"Burden, R. L. & Faires, J. D. 2.1 bissection method. In Numerical analysis, vol. 3 (PWS Publishers, 1985)."},{"key":"4664_CR25","doi-asserted-by":"publisher","first-page":"2402","DOI":"10.1001\/jama.2016.17216","volume":"316","author":"V Gulshan","year":"2016","unstructured":"Gulshan, V. et al. Development and validation of a deep learning algorithm for detection of diabetic retinopathy in retinal fundus photographs. Jama 316, 2402\u20132410, https:\/\/doi.org\/10.1001\/jama.2016.17216 (2016).","journal-title":"Jama"},{"key":"4664_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2523813","volume":"46","author":"J Gama","year":"2014","unstructured":"Gama, J., \u017dliobait\u0117, I., Bifet, A., Pechenizkiy, M. & Bouchachia, A. A survey on concept drift adaptation. ACM computing surveys (CSUR) 46, 1\u201337, https:\/\/doi.org\/10.1145\/2523813 (2014).","journal-title":"ACM computing surveys (CSUR)"},{"key":"4664_CR27","unstructured":"Elkan, C. The foundations of cost-sensitive learning. In International joint conference on artificial intelligence, vol. 17, 973\u2013978 (Lawrence Erlbaum Associates Ltd, 2001)."},{"key":"4664_CR28","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s42521-020-00023-1","volume":"2","author":"J Han","year":"2020","unstructured":"Han, J., Huang, Y., Liu, S. & Towey, K. Artificial intelligence for anti-money laundering: a review and extension. Digital Finance 2, 211\u2013239, https:\/\/doi.org\/10.1007\/s42521-020-00023-1 (2020).","journal-title":"Digital Finance"},{"key":"4664_CR29","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1016\/j.inffus.2021.11.011","volume":"81","author":"R Shwartz-Ziv","year":"2022","unstructured":"Shwartz-Ziv, R. & Armon, A. Tabular data: Deep learning is not all you need. Information Fusion 81, 84\u201390, https:\/\/doi.org\/10.1016\/j.inffus.2021.11.011 (2022).","journal-title":"Information Fusion"},{"key":"4664_CR30","doi-asserted-by":"publisher","unstructured":"Borisov, V. et al. Deep neural networks and tabular data: A survey. IEEE Transactions on Neural Networks and Learning Systems https:\/\/doi.org\/10.1109\/TNNLS.2022.3229161 (2022).","DOI":"10.1109\/TNNLS.2022.3229161"},{"key":"4664_CR31","doi-asserted-by":"publisher","unstructured":"Alves, J. et al. Financial fraud alert review dataset. figshare https:\/\/doi.org\/10.6084\/m9.figshare.28351172.","DOI":"10.6084\/m9.figshare.28351172"},{"key":"4664_CR32","doi-asserted-by":"publisher","first-page":"1770","DOI":"10.1016\/j.jss.2007.03.001","volume":"80","author":"S Grimstad","year":"2007","unstructured":"Grimstad, S. & J\u00f8rgensen, M. Inconsistency of expert judgment-based estimates of software development effort. Journal of Systems and Software 80, 1770\u20131777, https:\/\/doi.org\/10.1016\/j.jss.2007.03.001 (2007).","journal-title":"Journal of Systems and Software"},{"key":"4664_CR33","doi-asserted-by":"publisher","unstructured":"Remenyi, B. et al. Inter-rater and intra-rater reliability and agreement of echocardiographic diagnosis of rheumatic heart disease using the world heart federation evidence-based criteria. Heart Asia 11 https:\/\/doi.org\/10.1136\/heartasia-2019-011233 (2019).","DOI":"10.1136\/heartasia-2019-011233"},{"key":"4664_CR34","doi-asserted-by":"publisher","first-page":"104315","DOI":"10.1016\/j.euroecorev.2022.104315","volume":"151","author":"L Lippens","year":"2023","unstructured":"Lippens, L., Vermeiren, S. & Baert, S. The state of hiring discrimination: A meta-analysis of (almost) all recent correspondence experiments. European Economic Review 151, 104315, https:\/\/doi.org\/10.1016\/j.euroecorev.2022.104315 (2023).","journal-title":"European Economic Review"},{"key":"4664_CR35","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1146\/annurev-statistics-042720-125902","volume":"8","author":"S Mitchell","year":"2021","unstructured":"Mitchell, S., Potash, E., Barocas, S., D\u2019Amour, A. & Lum, K. Algorithmic fairness: Choices, assumptions, and definitions. Annual Review of Statistics and Its Application 8, 141\u2013163, https:\/\/doi.org\/10.1146\/annurev-statistics-042720-125902 (2021).","journal-title":"Annual Review of Statistics and Its Application"},{"key":"4664_CR36","doi-asserted-by":"publisher","unstructured":"Corbett-Davies, S., Pierson, E., Feller, A., Goel, S. & Huq, A. Algorithmic decision making and the cost of fairness. In Proceedings of the 23rd acm sigkdd international conference on knowledge discovery and data mining, 797\u2013806 https:\/\/doi.org\/10.1145\/3097983.3098095 (2017).","DOI":"10.1145\/3097983.3098095"},{"key":"4664_CR37","doi-asserted-by":"publisher","unstructured":"Alves, J. V. et al. Cost-sensitive learning to defer to multiple experts with workload constraints. Transactions on Machine Learning Research, 2835\u20138856, https:\/\/doi.org\/10.48550\/arXiv.2403.06906 (2024).","DOI":"10.48550\/arXiv.2403.06906"},{"key":"4664_CR38","doi-asserted-by":"publisher","unstructured":"Guo, C., Pleiss, G., Sun, Y. & Weinberger, K. Q. On calibration of modern neural networks. In International conference on machine learning, 1321\u20131330, https:\/\/doi.org\/10.48550\/arXiv.1706.04599 (PMLR, 2017).","DOI":"10.48550\/arXiv.1706.04599"},{"key":"4664_CR39","doi-asserted-by":"publisher","unstructured":"Hofmann, H. Statlog (German Credit Data). UCI Machine Learning Repository https:\/\/doi.org\/10.24432\/C5NC77 (1994).","DOI":"10.24432\/C5NC77"},{"key":"4664_CR40","doi-asserted-by":"publisher","unstructured":"Tailor, D., Patra, A., Verma, R., Manggala, P. & Nalisnick, E. Learning to defer to a population: A meta-learning approach. In International Conference on Artificial Intelligence and Statistics, 3475\u20133483, https:\/\/doi.org\/10.48550\/arXiv.2403.02683 (PMLR, 2024).","DOI":"10.48550\/arXiv.2403.02683"},{"key":"4664_CR41","doi-asserted-by":"publisher","unstructured":"Akiba, T., et al. (eds.) Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, KDD 2019, Anchorage, AK, USA, August 4\u20138, 2019, 2623\u20132631, https:\/\/doi.org\/10.1145\/3292500.3330701 (ACM, 2019).","DOI":"10.1145\/3292500.3330701"}],"container-title":["Scientific Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s41597-025-04664-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41597-025-04664-y","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41597-025-04664-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,23]],"date-time":"2025-04-23T09:02:37Z","timestamp":1745398957000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s41597-025-04664-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,23]]},"references-count":41,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["4664"],"URL":"https:\/\/doi.org\/10.1038\/s41597-025-04664-y","relation":{},"ISSN":["2052-4463"],"issn-type":[{"value":"2052-4463","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,23]]},"assertion":[{"value":"13 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"506"}}