{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T06:19:48Z","timestamp":1780640388255,"version":"3.54.1"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T00:00:00Z","timestamp":1759795200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T00:00:00Z","timestamp":1759795200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"name":"Duke Whitehead Scholar Award"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["npj Digit. Med."],"DOI":"10.1038\/s41746-025-01963-x","type":"journal-article","created":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T12:49:35Z","timestamp":1759841375000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":25,"title":["The evaluation illusion of large language models in medicine"],"prefix":"10.1038","volume":"8","author":[{"given":"Monica","family":"Agrawal","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Irene Y.","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Freya","family":"Gulamali","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shalmali","family":"Joshi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,10,7]]},"reference":[{"key":"1963_CR1","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1093\/jamia\/ocae295","volume":"32","author":"SJ Shah","year":"2025","unstructured":"Shah, S. J. et al. Ambient artificial intelligence scribes: physician burnout and perspectives on usability and documentation burden. J. Am. Med. Inform. Assoc. 32, 375\u2013380 (2025).","journal-title":"J. Am. Med. Inform. Assoc."},{"key":"1963_CR2","doi-asserted-by":"crossref","unstructured":"Rajashekar, N. C. et al. Human-algorithmic interaction using a large language model-augmented artificial intelligence clinical decision support system. In Proc. CHI Conf. Hum. Factors Comput. Syst. Vol. 442, 1\u201320 (Association for Computing Machinery, 2024).","DOI":"10.1145\/3613904.3642024"},{"key":"1963_CR3","doi-asserted-by":"publisher","first-page":"1665","DOI":"10.1093\/jamia\/ocae142","volume":"31","author":"S Liu","year":"2024","unstructured":"Liu, S. et al. Using large language model to guide patients to create efficient and comprehensive clinical care message. J. Am. Med. Inform. Assoc. 31, 1665\u20131670 (2024).","journal-title":"J. Am. Med. Inform. Assoc."},{"key":"1963_CR4","doi-asserted-by":"crossref","unstructured":"Jeong, D. P., Garg, S., Lipton, Z. C. & Oberst, M. Medical adaptation of large language and vision-language models: Are we making progress? In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (eds Al-Onaizan, Y., Bansal, M. & Chen, Y.-N.) 12143\u201312170. https:\/\/aclanthology.org\/2024.emnlp-main.677\/ (Association for Computational Linguistics, 2024).","DOI":"10.18653\/v1\/2024.emnlp-main.677"},{"key":"1963_CR5","unstructured":"Li, Y., Harrigian, K., Zirikly, A. & Dredze, M. Are clinical T5 models better for clinical text? In Proc. 4th Machine Learning for Health Symposium, vol. 259 of Proceedings of Machine Learning Research (eds Hegselmann, S. et al.) 636\u2013667. https:\/\/proceedings.mlr.press\/v259\/li25a.html (PMLR, 2025)."},{"key":"1963_CR6","doi-asserted-by":"publisher","unstructured":"Ling, C. et al. Domain specialization as the key to make large language models disruptive: a comprehensive survey. ACM Comput. Surv. https:\/\/doi.org\/10.1145\/3764579 (Association for Computing Machinery, 2025).","DOI":"10.1145\/3764579"},{"key":"1963_CR7","doi-asserted-by":"publisher","first-page":"589","DOI":"10.1093\/jamia\/ocae301","volume":"32","author":"S Joshi","year":"2025","unstructured":"Joshi, S. et al. AI as an intervention: improving clinical outcomes relies on a causal approach to AI development and validation. J. Am. Med. Inform. Assoc. 32, 589\u2013594 (2025).","journal-title":"J. Am. Med. Inform. Assoc."},{"key":"1963_CR8","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1001\/jama.2024.21700","volume":"333","author":"S Bedi","year":"2025","unstructured":"Bedi, S. et al. Testing and evaluation of health care applications of large language models: a systematic review. JAMA 333, 319\u2013328, https:\/\/doi.org\/10.1001\/jama.2024.21700 (2025).","journal-title":"JAMA"},{"key":"1963_CR9","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1146\/annurev-biodatasci-092820-114757","volume":"4","author":"IY Chen","year":"2021","unstructured":"Chen, I. Y. et al. Ethical machine learning in healthcare. Annu. Rev. Biomed. Data Sci. 4, 123\u2013144 (2021).","journal-title":"Annu. Rev. Biomed. Data Sci."},{"key":"1963_CR10","doi-asserted-by":"publisher","first-page":"e12","DOI":"10.1016\/S2589-7500(23)00225-X","volume":"6","author":"T Zack","year":"2024","unstructured":"Zack, T. et al. Assessing the potential of GPT-4 to perpetuate racial and gender biases in health care: a model evaluation study. The Lancet Digit. Health 6, e12\u2013e22 (2024).","journal-title":"The Lancet Digit. Health"},{"key":"1963_CR11","doi-asserted-by":"publisher","unstructured":"Alaa, A. et al. Medical large language model benchmarks should prioritize construct validity. arXiv preprint https:\/\/doi.org\/10.48550\/arXiv.2503.10694 (2025).","DOI":"10.48550\/arXiv.2503.10694"},{"key":"1963_CR12","doi-asserted-by":"publisher","unstructured":"Romanov, A. & Shivade, C. Lessons from natural language inference in the clinical domain. In Proc. Conf. Empir. Methods Nat. Lang. Process 1586\u20131596 https:\/\/doi.org\/10.18653\/v1\/D18-1187 (Association for Computational Linguistics, 2018).","DOI":"10.18653\/v1\/D18-1187"},{"key":"1963_CR13","doi-asserted-by":"publisher","unstructured":"Herlihy, C. & Rudinger, R. Mednli is not immune: natural language inference artifacts in the clinical domain. In Proc. Annu. Meet. Assoc.Comput. Linguist. Int. Jt. Conf. Nat. Lang. Process. Vol. 2, 1020\u20131027 https:\/\/doi.org\/10.18653\/v1\/2021.acl-short.129 (Association for Computational Linguistics, 2021).","DOI":"10.18653\/v1\/2021.acl-short.129"},{"key":"1963_CR14","doi-asserted-by":"publisher","first-page":"AIe2401235","DOI":"10.1056\/AIe2401235","volume":"2","author":"ID Raji","year":"2025","unstructured":"Raji, ID., Daneshjou, R. & Alsentzer, E. It\u2019s time to bench the medical exam benchmark. NEJM AI 2, AIe2401235, https:\/\/doi.org\/10.1056\/AIe2401235 (2025).","journal-title":"NEJM AI"},{"key":"1963_CR15","unstructured":"Ball, J. R., Miller, B. T. & Balogh, E. P. Improving Diagnosis in Health Care (National Academies Press, 2015)."},{"key":"1963_CR16","first-page":"28858","volume":"37","author":"S Li","year":"2024","unstructured":"Li, S. et al. Mediq: question-asking LLMs and a benchmark for reliable interactive clinical reasoning. Adv. Neural Inf. Process. Syst. 37, 28858\u201328888 (2024).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1963_CR17","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1038\/s41591-024-03328-5","volume":"31","author":"S Johri","year":"2025","unstructured":"Johri, S. et al. An evaluation framework for clinical use of large language models in patient interaction tasks. Nat. Med. 31, 77\u201386 (2025).","journal-title":"Nat. Med."},{"key":"1963_CR18","doi-asserted-by":"publisher","DOI":"10.1038\/s41746-024-01185-7","volume":"7","author":"Q Jin","year":"2024","unstructured":"Jin, Q. et al. Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in medicine. npj Digit. Med. 7, 190 (2024).","journal-title":"npj Digit. Med."},{"key":"1963_CR19","unstructured":"Lin, C.-Y. Rouge: A package for automatic evaluation of summaries. Proc. Workshop Text Summarization Branches Out, 74\u201381 (Association for Computational Linguistics, 2004)."},{"key":"1963_CR20","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T. & Zhu, W.-J. Bleu: a method for automatic evaluation of machine translation. In Proc. 40th Annu. Meet.Assoc. Comput. Linguist 311\u2013318 (Association for Computational Linguistics, 2002).","DOI":"10.3115\/1073083.1073135"},{"key":"1963_CR21","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K. Q. & Artzi, Y. Bertscore: evaluating text generation with Bert. Proc. Int. Conf. Learn. Rep. https:\/\/openreview.net\/forum?id=SkeHuCVFDr (2019)."},{"key":"1963_CR22","doi-asserted-by":"publisher","first-page":"e59","DOI":"10.2196\/jmir.6962","volume":"19","author":"J Zheng","year":"2017","unstructured":"Zheng, J. & Yu, H. Readability formulas and user perceptions of electronic health records difficulty: a corpus study. J. Med. Internet Res. 19, e59 (2017).","journal-title":"J. Med. Internet Res."},{"key":"1963_CR23","doi-asserted-by":"publisher","DOI":"10.1038\/s41746-024-01258-7","volume":"7","author":"TYC Tam","year":"2024","unstructured":"Tam, T. Y. C. et al. A framework for human evaluation of large language models in healthcare derived from literature review. NPJ Digit. Med. 7, 258 (2024).","journal-title":"NPJ Digit. Med."},{"key":"1963_CR24","doi-asserted-by":"publisher","unstructured":"Xie, Y. et al. Doclens: multi-aspect fine-grained evaluation for medical text generation. In Proc. 62nd Annu. Meet. Assoc. Comput. Linguist Vol. 1, 649\u2013679 https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.39 (Association for Computational Linguistics, 2024).","DOI":"10.18653\/v1\/2024.acl-long.39"},{"key":"1963_CR25","unstructured":"Arora, R. K. et al. Healthbench: evaluating large language models towards improved human health. arXiv preprint arXiv:2505.08775 https:\/\/arxiv.org\/abs\/2505.08775 (2025)."},{"key":"1963_CR26","unstructured":"Wu, E., Wu, K. & Zou, J. MedArena: Comparing LLMs for medicine in the wild. Stanford HAI News https:\/\/hai.stanford.edu\/news\/medarena-comparing-llms-for-medicine-in-the-wild (2025)."},{"key":"1963_CR27","doi-asserted-by":"publisher","first-page":"e246565","DOI":"10.1001\/jamanetworkopen.2024.6565","volume":"7","author":"M Tai-Seale","year":"2024","unstructured":"Tai-Seale, M. et al. AI-generated draft replies integrated into health records and physicians\u2019 electronic communication. JAMA Netw. Open 7, e246565\u2013e246565 (2024).","journal-title":"JAMA Netw. Open"},{"key":"1963_CR28","doi-asserted-by":"publisher","first-page":"ooae028","DOI":"10.1093\/jamiaopen\/ooae028","volume":"7","author":"SL Baxter","year":"2024","unstructured":"Baxter, S. L., Longhurst, C. A., Millen, M., Sitapati, A. M. & Tai-Seale, M. Generative artificial intelligence responses to patient messages in the electronic health record: early lessons learned. JAMIA open 7, ooae028 (2024).","journal-title":"JAMIA open"},{"key":"1963_CR29","doi-asserted-by":"publisher","first-page":"e243201","DOI":"10.1001\/jamanetworkopen.2024.3201","volume":"7","author":"P Garcia","year":"2024","unstructured":"Garcia, P. et al. Artificial intelligence\u2013generated draft replies to patient inbox messages. JAMA Netw. Open 7, e243201\u2013e243201 (2024).","journal-title":"JAMA Netw. Open"},{"key":"1963_CR30","doi-asserted-by":"publisher","first-page":"938","DOI":"10.1093\/jamia\/ocv032","volume":"22","author":"R Pivovarov","year":"2015","unstructured":"Pivovarov, R. & Elhadad, N. Automated methods for the summarization of electronic health records. J. Am. Med. Inform. Assoc. 22, 938\u2013947 (2015).","journal-title":"J. Am. Med. Inform. Assoc."},{"key":"1963_CR31","doi-asserted-by":"publisher","first-page":"224","DOI":"10.15265\/IY-2016-017","volume":"25","author":"D Demner-Fushman","year":"2016","unstructured":"Demner-Fushman, D. & Elhadad, N. Aspiring to unintended consequences of natural language processing: a review of recent developments in clinical and consumer-generated text processing. Yearb. Med. Inform. 25, 224\u2013233 (2016).","journal-title":"Yearb. Med. Inform."},{"key":"1963_CR32","doi-asserted-by":"publisher","first-page":"326","DOI":"10.1093\/ehjdh\/ztae086","volume":"6","author":"C-J Chao","year":"2025","unstructured":"Chao, C.-J. et al. Evaluating large language models in echocardiography reporting: opportunities and challenges. Eur. heart j. Digit. health 6, 326\u2013339 (2025).","journal-title":"Eur. heart j. Digit. health"},{"key":"1963_CR33","unstructured":"Abacha, A. B. et al. An investigation of evaluation methods in automatic medical note generation. In Findings of the Association for Computational Linguistics: ACL (2023)."},{"key":"1963_CR34","doi-asserted-by":"publisher","first-page":"1134","DOI":"10.1038\/s41591-024-02855-5","volume":"30","author":"D Van Veen","year":"2024","unstructured":"Van Veen, D. et al. Adapted large language models can outperform medical experts in clinical text summarization. Nat. Med. 30, 1134\u20131142 (2024).","journal-title":"Nat. Med."},{"key":"1963_CR35","doi-asserted-by":"crossref","unstructured":"Moramarco, F. et al. Human evaluation and correlation with automatic metrics in consultation note generation. In Proc. 60th Annual Meeting of the Association for Computational Linguistics Vol. 1, 5739\u20135754 (Association for Computational Linguistics, 2022).","DOI":"10.18653\/v1\/2022.acl-long.394"},{"key":"1963_CR36","doi-asserted-by":"crossref","unstructured":"Wang, L. L. et al. Automated metrics for medical multi-document summarization disagree with human evaluations. In Proc. conference. Association for Computational Linguistics. Meeting Vol. 1, 9871\u20139889 (2023).","DOI":"10.18653\/v1\/2023.acl-long.549"},{"key":"1963_CR37","doi-asserted-by":"crossref","unstructured":"Abacha, A. B., Yim, W.-w., Fan, Y. & Lin, T. An empirical study of clinical note generation from doctor-patient encounters. In Proc. 17th Conference of the European Chapter of the Association for Computational Linguistics 2291\u20132302 (2023).","DOI":"10.18653\/v1\/2023.eacl-main.168"}],"container-title":["npj Digital Medicine"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01963-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01963-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01963-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T12:49:42Z","timestamp":1759841382000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01963-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,7]]},"references-count":37,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["1963"],"URL":"https:\/\/doi.org\/10.1038\/s41746-025-01963-x","relation":{},"ISSN":["2398-6352"],"issn-type":[{"value":"2398-6352","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,7]]},"assertion":[{"value":"19 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 October 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"M.A. is a co-founder of Layer Health.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"600"}}