{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T00:49:46Z","timestamp":1778806186709,"version":"3.51.4"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T00:00:00Z","timestamp":1762300800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T00:00:00Z","timestamp":1762300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/100000092","name":"U.S. National Library of Medicine","doi-asserted-by":"publisher","award":["5T15LM007359"],"award-info":[{"award-number":["5T15LM007359"]}],"id":[{"id":"10.13039\/100000092","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["R00 LM014308-02"],"award-info":[{"award-number":["R00 LM014308-02"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["R01LM012973"],"award-info":[{"award-number":["R01LM012973"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["npj Digit. Med."],"DOI":"10.1038\/s41746-025-02005-2","type":"journal-article","created":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T16:35:46Z","timestamp":1762360546000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["Evaluating clinical AI summaries with large language models as judges"],"prefix":"10.1038","volume":"8","author":[{"given":"Emma","family":"Croxford","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanjun","family":"Gao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Elliot","family":"First","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nicholas","family":"Pellegrino","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Miranda","family":"Schnier","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"John","family":"Caskey","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Madeline","family":"Oguss","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Graham","family":"Wills","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guanhua","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dmitriy","family":"Dligach","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Matthew M.","family":"Churpek","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Anoop","family":"Mayampurath","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Frank","family":"Liao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cherodeep","family":"Goswami","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Karen K.","family":"Wong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Brian W.","family":"Patterson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Majid","family":"Afshar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,5]]},"reference":[{"key":"2005_CR1","doi-asserted-by":"publisher","first-page":"ooae039","DOI":"10.1093\/jamiaopen\/ooae039","volume":"7","author":"BW Patterson","year":"2024","unstructured":"Patterson, B. W. et al. Call me dr ishmael: Trends in electronic health record notes available at emergency department visits and admissions. JAMIA Open 7, ooae039 (2024).","journal-title":"JAMIA Open"},{"key":"2005_CR2","doi-asserted-by":"publisher","first-page":"505","DOI":"10.1197\/jamia.M1700","volume":"12","author":"L Poissant","year":"2005","unstructured":"Poissant, L., Pereira, J., Tamblyn, R. & Kawasumi, Y. The impact of electronic health records on time efficiency of physicians and nurses: A systematic review. J. Am. Med. Inform. Assoc.: JAMIA 12, 505\u2013516 (2005).","journal-title":"J. Am. Med. Inform. Assoc.: JAMIA"},{"key":"2005_CR3","doi-asserted-by":"publisher","first-page":"899","DOI":"10.1093\/jamia\/ocaa332","volume":"28","author":"MG Semanik","year":"2021","unstructured":"Semanik, M. G. et al. Impact of a problem-oriented view on clinical data retrieval. J. Am. Med. Inform. Assoc.: JAMIA 28, 899\u2013906 (2021).","journal-title":"J. Am. Med. Inform. Assoc.: JAMIA"},{"key":"2005_CR4","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1016\/j.jbi.2015.03.004","volume":"55","author":"O Ben-Assuli","year":"2015","unstructured":"Ben-Assuli, O., Sagi, D., Leshno, M., Ironi, A. & Ziv, A. Improving diagnostic accuracy using ehr in emergency departments: A simulation-based study. J. Biomed. Inform. 55, 31\u201340 (2015).","journal-title":"J. Biomed. Inform."},{"key":"2005_CR5","doi-asserted-by":"publisher","first-page":"718","DOI":"10.1136\/amiajnl-2012-000946","volume":"20","author":"PJ Embi","year":"2013","unstructured":"Embi, P. J. et al. Computerized provider documentation: findings and implications of a multisite study of clinicians and administrators. J. Am. Med. Inform. Assoc.: JAMIA 20, 718\u2013726 (2013).","journal-title":"J. Am. Med. Inform. Assoc.: JAMIA"},{"key":"2005_CR6","unstructured":"OpenAI et al. Gpt-4 technical report http:\/\/arxiv.org\/abs\/2303.08774 (2024)."},{"key":"2005_CR7","first-page":"ooae080","volume":"7","author":"M Afshar","year":"2024","unstructured":"Afshar, M. et al. Prompt engineering with a large language model to assist providers in responding to patient inquiries: a real-time implementation in the electronic health record. JAMIA Open 7, ooae080 (2024).","journal-title":"JAMIA Open"},{"key":"2005_CR8","doi-asserted-by":"publisher","first-page":"e243201","DOI":"10.1001\/jamanetworkopen.2024.3201","volume":"7","author":"P Garcia","year":"2024","unstructured":"Garcia, P. et al. Artificial intelligence\u2013generated draft replies to patient inbox messages. JAMA Netw. Open 7, e243201 (2024).","journal-title":"JAMA Netw. Open"},{"key":"2005_CR9","doi-asserted-by":"publisher","first-page":"e246565","DOI":"10.1001\/jamanetworkopen.2024.6565","volume":"7","author":"M Tai-Seale","year":"2024","unstructured":"Tai-Seale, M. et al. Ai-generated draft replies integrated into health records and physicians\u2019 electronic communication. JAMA Netw. Open 7, e246565 (2024).","journal-title":"JAMA Netw. Open"},{"key":"2005_CR10","unstructured":"Umapathi, L. K., Pal, A. & Sankarasubbu, M. Med-halt: Medical domain hallucination test for large language models http:\/\/arxiv.org\/abs\/2307.15343 (2023)."},{"key":"2005_CR11","unstructured":"Abacha, A. B., Yim, W.-w., Michalopoulos, G. & Lin, T. An investigation of evaluation metrics for automated medical note generation http:\/\/arxiv.org\/abs\/2305.17364 (2023)."},{"key":"2005_CR12","unstructured":"Zhao, W. X. et al. A survey of large language models http:\/\/arxiv.org\/abs\/2303.18223 (2023)."},{"key":"2005_CR13","doi-asserted-by":"crossref","unstructured":"He, K. et al. A survey of large language models for healthcare: from data, technology, and applications to accountability and ethics (2024). http:\/\/arxiv.org\/abs\/2310.05694.","DOI":"10.2139\/ssrn.4809363"},{"key":"2005_CR14","unstructured":"Li, T., Zhang, G., Do, Q. D., Yue, X. & Chen, W. Long-context llms struggle with long in-context learning http:\/\/arxiv.org\/abs\/2404.02060 (2024)."},{"key":"2005_CR15","unstructured":"Xiong, W. et al. Effective long-context scaling of foundation models. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), 4643\u20134663 (Association for Computational Linguistics, Mexico City, Mexico, 2024). https:\/\/aclanthology.org\/2024.naacl-long.260."},{"key":"2005_CR16","doi-asserted-by":"crossref","unstructured":"Sai, A., Mohankumar, A. & Khapra, M. A survey of evaluation metrics used for nlg systems. ACM Comput. Surv. 55, 1\u201339 (2023).","DOI":"10.1145\/3485766"},{"key":"2005_CR17","first-page":"309","volume":"2024","author":"E Croxford","year":"2024","unstructured":"Croxford, E. et al. Development of a human evaluation framework and correlation with automated metrics for natural language generation of medical diagnoses. AMIA \u2026 Annu. Symp. Proc. AMIA Symp. 2024, 309\u2013318 (2024).","journal-title":"AMIA \u2026 Annu. Symp. Proc. AMIA Symp."},{"key":"2005_CR18","doi-asserted-by":"publisher","first-page":"6","DOI":"10.1038\/s44401-024-00011-2","volume":"2","author":"E Croxford","year":"2025","unstructured":"Croxford, E. et al. Current and future state of evaluation of large language models for medical summarization tasks. npj Health Syst. 2, 6 (2025).","journal-title":"npj Health Syst."},{"key":"2005_CR19","doi-asserted-by":"crossref","unstructured":"Bedi, S. et al. Testing and evaluation of health care applications of large language models: A systematic review. JAMA 333, 319\u2013328 (2025).","DOI":"10.1001\/jama.2024.21700"},{"key":"2005_CR20","doi-asserted-by":"publisher","DOI":"10.1038\/s41746-024-01258-7","volume":"7","author":"TYC Tam","year":"2024","unstructured":"Tam, T. Y. C. et al. A framework for human evaluation of large language models in healthcare derived from literature review. npj Digital Med. 7, 258 (2024).","journal-title":"npj Digital Med."},{"key":"2005_CR21","doi-asserted-by":"publisher","first-page":"164","DOI":"10.4338\/ACI-2011-11-RA-0070","volume":"3","author":"P Stetson","year":"2012","unstructured":"Stetson, P., Bakken, S., Wrenn, J. & Siegler, E. Assessing electronic note quality using the physician documentation quality instrument (pdqi-9). Appl. Clin. Inform. 3, 164\u2013174 (2012).","journal-title":"Appl. Clin. Inform."},{"key":"2005_CR22","first-page":"CAT.23.0404","volume":"5","author":"AA Tierney","year":"2024","unstructured":"Tierney, A. A. et al. Ambient artificial intelligence scribes to alleviate the burden of clinical documentation. NEJM Catal. 5, CAT.23.0404 (2024).","journal-title":"NEJM Catal."},{"key":"2005_CR23","doi-asserted-by":"publisher","first-page":"1050","DOI":"10.1093\/jamia\/ocaf068","volume":"32","author":"E Croxford","year":"2025","unstructured":"Croxford, E. et al. Development and validation of the provider documentation summarization quality instrument for large language models. J. Am. Med. Inform. Assoc. 32, 1050\u20131060 (2025).","journal-title":"J. Am. Med. Inform. Assoc."},{"key":"2005_CR24","unstructured":"Li, D. et al. From generation to judgment: Opportunities and challenges of LLM-as-a-judge http:\/\/arxiv.org\/abs\/2411.16594 (2025)."},{"key":"2005_CR25","unstructured":"Li, H. et al. Llms-as-judges: A comprehensive survey on LLM-based evaluation methods http:\/\/arxiv.org\/abs\/2412.05579 (2024)."},{"key":"2005_CR26","unstructured":"Gu, J. et al. A survey on LLM-as-a-judge http:\/\/arxiv.org\/abs\/2411.15594 (2025)."},{"key":"2005_CR27","unstructured":"Fourney, A. et al. Magentic-one: A generalist multi-agent system for solving complex tasks http:\/\/arxiv.org\/abs\/2411.04468 (2024)."},{"key":"2005_CR28","first-page":"461","volume":"2023","author":"Y Gao","year":"2023","unstructured":"Gao, Y., Dligach, D., Miller, T., Churpek, M. M. & Afshar, M. Overview of the problem list summarization (probsum) 2023 shared task on summarizing patients\u2019 active diagnoses and problems from electronic health record progress notes. Proc. Conf. Assoc. Computational Linguist. Meet. 2023, 461\u2013467 (2023).","journal-title":"Proc. Conf. Assoc. Computational Linguist. Meet."},{"key":"2005_CR29","unstructured":"Hugging face \u2013 the ai community building the future (2024). https:\/\/huggingface.co\/."},{"key":"2005_CR30","unstructured":"Panickssery, A., Bowman, S. R. & Feng, S. Llm evaluators recognize and favor their own generations."},{"key":"2005_CR31","unstructured":"Dubois, Y., Galambosi, B., Liang, P. & Hashimoto, T. B. Length-controlled alpacaeval: A simple way to debias automatic evaluators http:\/\/arxiv.org\/abs\/2404.04475 (2025)."},{"key":"2005_CR32","unstructured":"Zheng, L. et al. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena http:\/\/arxiv.org\/abs\/2306.05685 (2023)."},{"key":"2005_CR33","unstructured":"Du, Y., Li, S., Torralba, A., Tenenbaum, J. B. & Mordatch, I. Improving factuality and reasoning in language models through multiagent debate http:\/\/arxiv.org\/abs\/2305.14325 (2023)."},{"key":"2005_CR34","doi-asserted-by":"crossref","unstructured":"Liang, T. et al. Encouraging divergent thinking in large language models through multi-agent debate. In Al-Onaizan, Y., Bansal, M. & Chen, Y.-N. (eds.) Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, 17889\u201317904 (Association for Computational Linguistics, Miami, Florida, USA, 2024). https:\/\/aclanthology.org\/2024.emnlp-main.992\/.","DOI":"10.18653\/v1\/2024.emnlp-main.992"},{"key":"2005_CR35","unstructured":"Wang, J., Wang, J., Athiwaratkun, B., Zhang, C. & Zou, J. Mixture-of-agents enhances large language model capabilities http:\/\/arxiv.org\/abs\/2406.04692 (2024)."},{"key":"2005_CR36","unstructured":"Kim, Y. et al. Mdagents: An adaptive collaboration of llms for medical decision-making (2024)."},{"key":"2005_CR37","unstructured":"DeepSeek-AI. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning https:\/\/arxiv.org\/abs\/2501.12948 (2025)."},{"key":"2005_CR38","unstructured":"von Werra, L. et al. Trl: Transformer reinforcement learning. https:\/\/github.com\/huggingface\/trl (2020)."},{"key":"2005_CR39","unstructured":"Frantar, E., Ashkboos, S., Hoefler, T. & Alistarh, D. Gptq: Accurate post-training quantization for generative pre-trained transformers http:\/\/arxiv.org\/abs\/2210.17323 (2023)."},{"key":"2005_CR40","unstructured":"Dettmers, T., Pagnoni, A., Holtzman, A. & Zettlemoyer, L. Qlora: Efficient finetuning of quantized llms http:\/\/arxiv.org\/abs\/2305.14314 (2023)."},{"key":"2005_CR41","doi-asserted-by":"crossref","unstructured":"Akiba, T., Sano, S., Yanase, T., Ohta, T. & Koyama, M. Optuna: A next-generation hyperparameter optimization framework. In The 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, 2623\u20132631 (2019).","DOI":"10.1145\/3292500.3330701"},{"key":"2005_CR42","doi-asserted-by":"publisher","unstructured":"Fisher, R. A.Statistical Methods for Research Workers, 66\u201370 (Springer, New York, NY, 1992). https:\/\/doi.org\/10.1007\/978-1-4612-4380-9_6","DOI":"10.1007\/978-1-4612-4380-9_6"},{"key":"2005_CR43","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1016\/j.jcm.2016.02.012","volume":"15","author":"TK Koo","year":"2016","unstructured":"Koo, T. K. & Li, M. Y. A guideline of selecting and reporting intraclass correlation coefficients for reliability research. J. Chiropr. Med. 15, 155 (2016).","journal-title":"J. Chiropr. Med."},{"key":"2005_CR44","doi-asserted-by":"crossref","unstructured":"Shrout, P. E. & Fleiss, J. L. Intraclass correlations: Uses in assessing rater reliability. Psychol. Bull. 86, 420\u20138 (1979).","DOI":"10.1037\/\/0033-2909.86.2.420"},{"key":"2005_CR45","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1348\/000711006X126600","volume":"61","author":"KL Gwet","year":"2008","unstructured":"Gwet, K. L. Computing inter-rater reliability and its variance in the presence of high agreement. Br. J. Math. Stat. Psychol. 61, 29\u201348 (2008).","journal-title":"Br. J. Math. Stat. Psychol."}],"container-title":["npj Digital Medicine"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-02005-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-02005-2","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-02005-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T16:35:49Z","timestamp":1762360549000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-02005-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,5]]},"references-count":45,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["2005"],"URL":"https:\/\/doi.org\/10.1038\/s41746-025-02005-2","relation":{},"ISSN":["2398-6352"],"issn-type":[{"value":"2398-6352","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,5]]},"assertion":[{"value":"20 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"640"}}