{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T16:14:37Z","timestamp":1776096877873,"version":"3.50.1"},"reference-count":25,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T00:00:00Z","timestamp":1756425600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T00:00:00Z","timestamp":1756425600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"name":"Weill Neurohub"},{"DOI":"10.13039\/100000054","name":"National Cancer Institute","doi-asserted-by":"publisher","award":["P30CA082103"],"award-info":[{"award-number":["P30CA082103"]}],"id":[{"id":"10.13039\/100000054","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["npj Digit. Med."],"DOI":"10.1038\/s41746-025-01926-2","type":"journal-article","created":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T09:31:05Z","timestamp":1756459865000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Robustness tests for biomedical foundation models should tailor to specifications"],"prefix":"10.1038","volume":"8","author":[{"given":"R. Patrick","family":"Xian","sequence":"first","affiliation":[]},{"given":"Noah R.","family":"Baker","sequence":"additional","affiliation":[]},{"given":"Tom","family":"David","sequence":"additional","affiliation":[]},{"given":"Qiming","family":"Cui","sequence":"additional","affiliation":[]},{"given":"A. Jay","family":"Holmgren","sequence":"additional","affiliation":[]},{"given":"Stefan","family":"Bauer","sequence":"additional","affiliation":[]},{"given":"Madhumita","family":"Sushil","sequence":"additional","affiliation":[]},{"given":"Reza","family":"Abbasi-Asl","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,29]]},"reference":[{"key":"1926_CR1","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1145\/3665926","volume":"57","author":"A Tocchetti","year":"2025","unstructured":"Tocchetti, A. et al. A.I. Robustness: a human-centered perspective on technological challenges and opportunities. ACM Comput. Surv. 57, 141 (2025).","journal-title":"ACM Comput. Surv."},{"key":"1926_CR2","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1038\/s41746-022-00737-z","volume":"5","author":"KM Kostick-Quenet","year":"2022","unstructured":"Kostick-Quenet, K. M. & Gerke, S. AI in the hands of imperfect users. npj Digit. Med. 5, 197 (2022).","journal-title":"npj Digit. Med."},{"key":"1926_CR3","doi-asserted-by":"publisher","first-page":"15","DOI":"10.14296\/deeslr.v17i0.5171","volume":"17","author":"PB Ladkin","year":"2020","unstructured":"Ladkin, P. B. Robustness of Software. Digit. Evid. Electron. Signature Law Rev. 17, 15\u201324 (2020).","journal-title":"Digit. Evid. Electron. Signature Law Rev."},{"key":"1926_CR4","doi-asserted-by":"publisher","first-page":"241","DOI":"10.1001\/jama.2024.21451","volume":"333","author":"HJ Warraich","year":"2025","unstructured":"Warraich, H. J., Tazbaz, T. & Califf, R. M. FDA perspective on the regulation of artificial intelligence in health care and biomedicine. JAMA 333, 241\u2013247 (2025).","journal-title":"JAMA"},{"key":"1926_CR5","doi-asserted-by":"publisher","first-page":"e662","DOI":"10.1016\/S2589-7500(24)00124-9","volume":"6","author":"O Freyer","year":"2024","unstructured":"Freyer, O., Wiest, I. C., Kather, J. N. & Gilbert, S. A future role for health applications of large language models depends on regulators enforcing safety standards. Lancet Digit. Health 6, e662\u2013e672 (2024).","journal-title":"Lancet Digit. Health"},{"key":"1926_CR6","doi-asserted-by":"publisher","first-page":"1227","DOI":"10.1093\/jamia\/ocad065","volume":"30","author":"D Lyell","year":"2023","unstructured":"Lyell, D., Wang, Y., Coiera, E. & Magrabi, F. More than algorithms: an analysis of safety events involving ML-enabled medical devices reported to the FDA. J. Am. Med. Inform. Assoc 30, 1227\u20131236 (2023).","journal-title":"J. Am. Med. Inform. Assoc"},{"key":"1926_CR7","doi-asserted-by":"publisher","first-page":"929","DOI":"10.1109\/32.962562","volume":"27","author":"G Rothermel","year":"2001","unstructured":"Rothermel, G., Untch, R., Chu, C. & Harrold, M. Prioritizing test cases for regression testing. IEEE Trans. Softw. Eng. 27, 929\u2013948 (2001).","journal-title":"IEEE Trans. Softw. Eng."},{"key":"1926_CR8","unstructured":"Chen, P.-Y., Liu, S. & Paul, S. Foundational Robustness of Foundation Models. NeurIPS Tutorial (2022). https:\/\/research.ibm.com\/publications\/foundational-robustness-of-foundation-models."},{"key":"1926_CR9","unstructured":"Qi, X. et al. AI Risk Management Should Incorporate Both Safety and Security. arXiv:2405.19524. http:\/\/arxiv.org\/abs\/2405.19524 (2024)."},{"key":"1926_CR10","doi-asserted-by":"publisher","first-page":"1156","DOI":"10.1038\/s42256-024-00899-3","volume":"6","author":"J Yang","year":"2024","unstructured":"Yang, J. et al. Poisoning medical knowledge using large language models. Nat. Mach. Intell. 6, 1156\u20131168 (2024).","journal-title":"Nat. Mach. Intell."},{"key":"1926_CR11","doi-asserted-by":"crossref","unstructured":"Jin, R., Huang, C.-Y., You, C. & Li, X. Back-door Attack on Unpaired Medical Image-Text Foundation Models: A Pilot Study on Med-CLIP. In 2024 IEEE Conference on Secure and Trustworthy Machine Learning (SaTML), 272\u2013285 (IEEE, 2024).","DOI":"10.1109\/SaTML59370.2024.00020"},{"key":"1926_CR12","unstructured":"Chowdhury, A. G. et al. Breaking Down the Defenses: A Comparative Survey of Attacks on Large Language Models. arXiv:2403.04786. http:\/\/arxiv.org\/abs\/2403.04786 (2024)."},{"key":"1926_CR13","doi-asserted-by":"crossref","unstructured":"Karunanayake, N., Gunawardena, R., Senevi-Ratne, S. & Chawla, S. Out-of-distribution data: an acquaintance of adversarial examples - a survey. ACM Comput. Surv. 57, 210 (2025).","DOI":"10.1145\/3719292"},{"key":"1926_CR14","doi-asserted-by":"publisher","first-page":"2613","DOI":"10.1038\/s41591-024-03097-1","volume":"30","author":"P Hager","year":"2024","unstructured":"Hager, P. et al. Evaluation and mitigation of the limitations of large language models in clinical decision-making. Nat. Med. 30, 2613\u20132622 (2024).","journal-title":"Nat. Med."},{"key":"1926_CR15","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1038\/s41591-024-03328-5","volume":"31","author":"S Johri","year":"2025","unstructured":"Johri, S. et al. An evaluation framework for clinical use of large language models in patient interaction tasks. Nat. Med. 31, 77\u201386 (2025).","journal-title":"Nat. Med."},{"key":"1926_CR16","doi-asserted-by":"publisher","first-page":"288:1","DOI":"10.1038\/s41746-024-01282-7","volume":"7","author":"T Han","year":"2024","unstructured":"Han, T. et al. Medical large language models are susceptible to targeted misinformation attacks. npj Digit. Med. 7, 288:1\u20139 (2024).","journal-title":"npj Digit. Med."},{"key":"1926_CR17","doi-asserted-by":"crossref","unstructured":"Yan, Q., He, X., Yue, X. & Wang, X. E. Worse than random? An embarrassingly simple probing evaluation of large multimodal models in medical VQA. Findings of the ACL, 10345\u201310359 (2025).","DOI":"10.18653\/v1\/2025.findings-acl.981"},{"key":"1926_CR18","unstructured":"Xian, R. P. et al. Assessing biomedical knowledge robustness in large language models by query-efficient sampling attacks. Trans. Mach. Learn. Res. (2024). https:\/\/openreview.net\/forum? ID=pvol5JyVYB."},{"key":"1926_CR19","doi-asserted-by":"publisher","first-page":"120289","DOI":"10.1016\/j.neuroimage.2023.120289","volume":"278","author":"L Boone","year":"2023","unstructured":"Boone, L. et al. ROOD-MRI: Benchmarking the robustness of deep learning segmentation models to out-of-distribution and corrupted data in MRI. NeuroImage 278, 120289 (2023).","journal-title":"NeuroImage"},{"key":"1926_CR20","unstructured":"Yang, Y., Zhang, H., Katabi, D. & Ghassemi, M. Change is hard: a closer look at subpopulation shift. In ICML, 39584\u201339622 (Honolulu, Hawaii, USA, 2023)."},{"key":"1926_CR21","unstructured":"Chandu, K. et al. Certainly Uncertain: A Benchmark and Metric for Multimodal Epistemic and Aleatoric Awareness. ICLR (2025). https:\/\/openreview.net\/forum?id=cQ25MQQSNI."},{"key":"1926_CR22","doi-asserted-by":"crossref","unstructured":"Wang, W. et al. A Survey of LLM-based Agents in Medicine: How far are we from Baymax? Findings of the ACL, 10345\u201310359 (Vienna, Austria, 2025). https:\/\/aclanthology.org\/2025.findings-acl.539\/.","DOI":"10.18653\/v1\/2025.findings-acl.539"},{"key":"1926_CR23","unstructured":"Mukherjee, S. et al. Polaris: A Safety-focused LLM Constellation Architecture for Healthcare. arXiv:2403.13313. http:\/\/arxiv.org\/abs\/2403.13313 (2024)."},{"key":"1926_CR24","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1186\/s12911-019-0882-0","volume":"19","author":"K Radcliffe","year":"2019","unstructured":"Radcliffe, K., Lyson, H. C., Barr-Walker, J. & Sarkar, U. Collective intelligence in medical decision-making: a systematic scoping review. BMC Med. Inform. Decis. Making. 19, 158 (2019).","journal-title":"BMC Med. Inform. Decis. Making."},{"key":"1926_CR25","unstructured":"Koessler, L., Schuett, J. & Anderljung, M. Risk thresholds for frontier AI. arXiv:2406.14713. http:\/\/arxiv.org\/abs\/2406.14713 (2024)."}],"container-title":["npj Digital Medicine"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01926-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01926-2","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01926-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T20:38:41Z","timestamp":1757450321000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01926-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,29]]},"references-count":25,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["1926"],"URL":"https:\/\/doi.org\/10.1038\/s41746-025-01926-2","relation":{},"ISSN":["2398-6352"],"issn-type":[{"value":"2398-6352","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8,29]]},"assertion":[{"value":"17 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"T.D. is a co-founder and director of governance & standardization at PRISM Eval. Other authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"557"}}