{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,2]],"date-time":"2026-05-02T08:27:23Z","timestamp":1777710443481,"version":"3.51.4"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,3,17]],"date-time":"2025-03-17T00:00:00Z","timestamp":1742169600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,3,17]],"date-time":"2025-03-17T00:00:00Z","timestamp":1742169600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["npj Digit. Med."],"DOI":"10.1038\/s41746-025-01475-8","type":"journal-article","created":{"date-parts":[[2025,3,17]],"date-time":"2025-03-17T11:43:11Z","timestamp":1742211791000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":26,"title":["Large language model agents can use tools to perform clinical calculations"],"prefix":"10.1038","volume":"8","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0229-8843","authenticated-orcid":false,"given":"Alex J.","family":"Goodell","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3863-1548","authenticated-orcid":false,"given":"Simon N.","family":"Chu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1188-6877","authenticated-orcid":false,"given":"Dara","family":"Rouholiman","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0814-6402","authenticated-orcid":false,"given":"Larry F.","family":"Chu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,17]]},"reference":[{"key":"1475_CR1","doi-asserted-by":"publisher","unstructured":"Nori, H. et al. Can generalist foundation models outcompete special-purpose tuning? Case study in medicine. https:\/\/doi.org\/10.48550\/arXiv.2311.16452 (2023).","DOI":"10.48550\/arXiv.2311.16452"},{"key":"1475_CR2","doi-asserted-by":"publisher","first-page":"e0000198","DOI":"10.1371\/journal.pdig.0000198","volume":"2","author":"TH Kung","year":"2023","unstructured":"Kung, T. H. et al. Performance of ChatGPT on USMLE: potential for ai-assisted medical education using large language models. PLoS Digit. Health 2, e0000198 (2023).","journal-title":"PLoS Digit. Health"},{"key":"1475_CR3","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1038\/s41586-023-06291-2","volume":"620","author":"K Singhal","year":"2023","unstructured":"Singhal, K. et al. Large language models encode clinical knowledge. Nature 620, 172\u2013180 (2023).","journal-title":"Nature"},{"key":"1475_CR4","unstructured":"OpenAI. Gpt-4 technical report. http:\/\/arxiv.org\/abs\/2303.08774 (2023)."},{"key":"1475_CR5","doi-asserted-by":"publisher","DOI":"10.1038\/s41746-024-01010-1","volume":"7","author":"T Savage","year":"2023","unstructured":"Savage, T., Nayak, A., Gallo, R., Rangan, E. & Chen, J. H. Diagnostic reasoning prompts reveal the potential for large language model interpretability in medicine. NPJ Digit. Med. 7, 20 (2023).","journal-title":"NPJ Digit. Med."},{"key":"1475_CR6","doi-asserted-by":"publisher","first-page":"17341","DOI":"10.1038\/s41598-024-66933-x","volume":"14","author":"D Patel","year":"2024","unstructured":"Patel, D. et al. Evaluating prompt engineering on GPT-3.5\u2019s performance in USMLE-style medical calculations and clinical scenarios generated by GPT-4. Sci. Rep. 14, 17341 (2024).","journal-title":"Sci. Rep."},{"key":"1475_CR7","doi-asserted-by":"publisher","first-page":"AIoa2300068","DOI":"10.1056\/AIoa2300068","volume":"1","author":"C Zakka","year":"2024","unstructured":"Zakka, C. et al. Almanac\u2014retrieval-augmented language models for clinical medicine. NEJM AI 1, AIoa2300068 (2024).","journal-title":"NEJM AI"},{"key":"1475_CR8","unstructured":"Yao, S. et. al. ReAct: Synergizing Reasoning and Acting in Language Models. The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=WE_vluYUL-X (2023)."},{"key":"1475_CR9","doi-asserted-by":"publisher","first-page":"121101","DOI":"10.1007\/s11432-024-4222-0","volume":"68","author":"ZH Xi","year":"2025","unstructured":"Xi, Z. H. et al. The rise and potential of large language model based agents: a survey. Sci China Inf Sci. 68, 121101 (2025).","journal-title":"Sci China Inf Sci."},{"key":"1475_CR10","doi-asserted-by":"publisher","unstructured":"Yubo, W., Xueguang, M. & Wenhu, C. Augmenting Black-box LLMs with Medical Textbooks for Biomedical Question Answering. https:\/\/doi.org\/10.48550\/arXiv.2309.02233 (2024).","DOI":"10.48550\/arXiv.2309.02233"},{"key":"1475_CR11","doi-asserted-by":"publisher","first-page":"btae075","DOI":"10.1093\/bioinformatics\/btae075","volume":"40","author":"Q Jin","year":"2024","unstructured":"Jin, Q., Yang, Y., Chen, Q. & Lu, Z. Genegpt: augmenting large language models with domain tools for improved access to biomedical information. Bioinformatics 40, btae075 (2024).","journal-title":"Bioinformatics"},{"key":"1475_CR12","doi-asserted-by":"crossref","unstructured":"Imani, S., Du, L. & Shrivastava, H. MathPrompter: Mathematical Reasoning using Large Language Models. In Proc. 61st Annual Meeting of the Association for Computational Linguistics (eds Sitaram, S., Beigman K. B. & Williams, J. D.) Volume 5: Industry Track, 37\u201342 (Association for Computational Linguistics, Toronto, Canada, 2023).","DOI":"10.18653\/v1\/2023.acl-industry.4"},{"key":"1475_CR13","unstructured":"MDCalc. Faq. https:\/\/web.archive.org\/web\/20231212045610\/https:\/\/www.mdcalc.com\/faq (2023)."},{"key":"1475_CR14","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1097\/01.EEM.0000482476.20709.dc","volume":"38","author":"G Walker","year":"2016","unstructured":"Walker, G. Emergentology: the ups and downs of developing the MDCalc app. Emerg. Med. News 38, 18 (2016).","journal-title":"Emerg. Med. News"},{"key":"1475_CR15","doi-asserted-by":"publisher","first-page":"833","DOI":"10.1016\/j.jamcollsurg.2013.07.385","volume":"217","author":"KY Bilimoria","year":"2013","unstructured":"Bilimoria, K. Y. et al. Development and evaluation of the universal ACS NSQIP surgical risk calculator: a decision aid and informed consent tool for patients and surgeons. J. Am. Coll. Surg. 217, 833 (2013).","journal-title":"J. Am. Coll. Surg."},{"key":"1475_CR16","doi-asserted-by":"publisher","first-page":"1411","DOI":"10.1016\/j.athoracsur.2018.03.002","volume":"105","author":"DM Shahian","year":"2018","unstructured":"Shahian, D. M. et al. The society of thoracic surgeons 2018 adult cardiac surgery risk models: part 1\u2014background, design considerations, and model development. Ann. Thorac. Surg. 105, 1411\u20131418 (2018).","journal-title":"Ann. Thorac. Surg."},{"key":"1475_CR17","unstructured":"Gliadkovskaya, A. Some doctors are using public AI chatbots like ChatGPT in clinical decisions. is it safe? https:\/\/www.fiercehealthcare.com\/special-reports\/some-doctors-are-using-public-generative-ai-tools-chatgpt-clinical-decisions-it (2024)."},{"key":"1475_CR18","doi-asserted-by":"crossref","unstructured":"Ng, M. Y., Helzer, J., Pfeffer, M. A., Seto, T. & Hernandez-Boussard, T. Development of secure infrastructure for advancing generative ai research in healthcare at an academic medical center. J. Am. Med. Inform. Assoc. 32, 586\u2013588 (2025).","DOI":"10.1093\/jamia\/ocaf005"},{"key":"1475_CR19","unstructured":"Bumgardner, V. K. C. et al. Institutional platform for secure self-service large language model exploration. arXiv [cs.CR]. http:\/\/arxiv.org\/abs\/2402.00913 (2024)."},{"key":"1475_CR20","doi-asserted-by":"publisher","first-page":"e2440969","DOI":"10.1001\/jamanetworkopen.2024.40969","volume":"7","author":"E Goh","year":"2024","unstructured":"Goh, E. et al. Large language model influence on diagnostic reasoning: a randomized clinical trial. JAMA Netw. Open 7, e2440969 (2024).","journal-title":"JAMA Netw. Open"},{"key":"1475_CR21","doi-asserted-by":"publisher","first-page":"608","DOI":"10.1097\/ALN.0000000000005122","volume":"141","author":"HJ Hong","year":"2024","unstructured":"Hong, H. J., Schmiesing, C. A. & Goodell, A. J. Enhancing the readability of preoperative patient instructions using large language models. Anesthesiology 141, 608\u2013610 (2024).","journal-title":"Anesthesiology"},{"key":"1475_CR22","doi-asserted-by":"publisher","first-page":"928","DOI":"10.1001\/jamasurg.2024.1621","volume":"159","author":"P Chung","year":"2024","unstructured":"Chung, P. et al. Large language model capabilities in perioperative risk prediction and prognostication. JAMA Surg. 159, 928\u2013937 (2024).","journal-title":"JAMA Surg."},{"key":"1475_CR23","doi-asserted-by":"publisher","first-page":"997","DOI":"10.1093\/jamia\/ocad253","volume":"31","author":"B Kwan","year":"2024","unstructured":"Kwan, B., Bell, J. F., Longhurst, C. A., Goldhaber, N. H. & Clay, B. Implementation of an electronic health record-integrated instant messaging system in an academic health system. J. Am. Med. Inform. Assoc. 31, 997\u20131000 (2024).","journal-title":"J. Am. Med. Inform. Assoc."},{"key":"1475_CR24","first-page":"68539","volume":"36","author":"TD Schick","year":"2023","unstructured":"Schick, T. D. et al. Toolformer: Language models can teach themselves to use tools. Adv. Neural Inf. Process. Syst. 36, 68539\u201368551 (2023).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1475_CR25","doi-asserted-by":"publisher","unstructured":"Goodell, A. J., Chu, S. N., Rouholiman, D. & Chu, L. F. Augmentation of ChatGPT with clinician-informed tools improves performance on medical calculation tasks. medRxiv 2023-12. https:\/\/doi.org\/10.1101\/2023.12.13.23299881 (2023).","DOI":"10.1101\/2023.12.13.23299881"},{"key":"1475_CR26","unstructured":"Khandekar, N. et al. Medcalc-bench: Evaluating large language models for medical calculations. arXiv [cs.CL]. http:\/\/arxiv.org\/abs\/2406.12036 (2024)."},{"key":"1475_CR27","unstructured":"Jin, Q. et al. Agentmd: empowering language agents for risk prediction with large-scale clinical tool learning. arXiv [cs.CL]. http:\/\/arxiv.org\/abs\/2402.13225 (2024)."},{"key":"1475_CR28","unstructured":"Zhu, Y. et al. Menti: bridging medical calculator and llm agent with nested tool calling. arXiv [cs.AI]. http:\/\/arxiv.org\/abs\/2410.13610 (2024)."},{"key":"1475_CR29","unstructured":"Wan, N. et al. Humans continue to outperform large language models in complex clinical decision-making: a study with medical calculators. arXiv [cs.CL]. http:\/\/arxiv.org\/abs\/2411.05897 (2024)."},{"key":"1475_CR30","doi-asserted-by":"crossref","unstructured":"Fleming, S. L. et al. MedAlign: A Clinician-Generated Dataset for Instruction Following with Electronic Medical Records. In Proc. AAAI Conference on Artificial Intelligence, Vol. 38, 22021\u201322030 (2024).","DOI":"10.1609\/aaai.v38i20.30205"},{"key":"1475_CR31","doi-asserted-by":"crossref","unstructured":"Pal, A. et al. Med-HALT: Medical Domain Hallucination Test for Large Language Models. In Proc. 27th Conference on Computational Natural Language Learning (CoNLL) (eds Jiang, J., Reitter, D. & Deng, S) 314\u2013334 (Association for Computational Linguistics, Singapore, 2023) https:\/\/aclanthology.org\/2023.conll-1.21\/","DOI":"10.18653\/v1\/2023.conll-1.21"},{"key":"1475_CR32","doi-asserted-by":"crossref","unstructured":"Huang, J. & Chang, K. C-C. Towards Reasoning in Large Language Models: A Survey. Findings of the Association for Computational Linguistics: ACL 2023 (eds Rogers, A., Boyd-Graber, J. & Okazaki, N) 1049\u20131065 (Association for Computational Linguistics, Toronto, Canada, 2023).","DOI":"10.18653\/v1\/2023.findings-acl.67"},{"key":"1475_CR33","doi-asserted-by":"crossref","unstructured":"Qiao, S. et al. Reasoning with Language Model Prompting: A Survey. In Proc. 61st Annual Meeting of the Association for Computational Linguistics, Vol. 1 (eds Rogers, A., Boyd-Graber, J. & Okazaki, N) 5368\u20135393 (Association for Computational Linguistics, Toronto, Canada, 2023).","DOI":"10.18653\/v1\/2023.acl-long.294"},{"key":"1475_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41746-021-00504-6","volume":"4","author":"SP Shashikumar","year":"2021","unstructured":"Shashikumar, S. P., Wardi, G., Malhotra, A. & Nemati, S. Artificial intelligence sepsis prediction algorithm learns to say \u201cI don\u2019t know\u201d. npj Digit. Med. 4, 1\u20139 (2021).","journal-title":"npj Digit. Med."},{"key":"1475_CR35","doi-asserted-by":"publisher","unstructured":"Tang, X. et al. MedAgents: Large Language Models as Collaborators for Zero-shot Medical Reasoning. https:\/\/doi.org\/10.48550\/arXiv.2311.10537 (2024).","DOI":"10.48550\/arXiv.2311.10537"},{"key":"1475_CR36","doi-asserted-by":"publisher","unstructured":"Liao, Y., Meng, Y., Liu, H., Wang, Y. & Wang, Y. An automatic evaluation framework for multi-turn medical consultations capabilities of large language models. https:\/\/doi.org\/10.48550\/arXiv.2309.02077 (2023).","DOI":"10.48550\/arXiv.2309.02077"},{"key":"1475_CR37","unstructured":"Schmidgall, S. et al. Agentclinic: a multimodal agent benchmark to evaluate ai in simulated clinical environments. arXiv [cs.HC] http:\/\/arxiv.org\/abs\/2405.07960 (2024)."},{"key":"1475_CR38","doi-asserted-by":"publisher","first-page":"1134","DOI":"10.1038\/s41591-024-02855-5","volume":"30","author":"D Van Veen","year":"2024","unstructured":"Van Veen, D. et al. Adapted large language models can outperform medical experts in clinical text summarization. Nat. Med. 30, 1134\u20131142 (2024).","journal-title":"Nat. Med."},{"key":"1475_CR39","unstructured":"Willard, B. T. & Louf, R. Efficient guided generation for large language models. arXiv [cs.CL]. http:\/\/arxiv.org\/abs\/2307.09702 (2023)."},{"key":"1475_CR40","unstructured":"Chase, H. Langchain. https:\/\/github.com\/langchain-ai\/langchain (2022)."},{"key":"1475_CR41","unstructured":"Ram\u00edrez, S. Fastapi. https:\/\/github.com\/tiangolo\/fastapi (2023)."},{"key":"1475_CR42","unstructured":"Miller, D. et al. OpenAPI Specification v3. 1.0 (OpenAPI Initiative, The Linux Foundation, 2021). https:\/\/spec.openapis.org\/."}],"container-title":["npj Digital Medicine"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01475-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01475-8","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01475-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,17]],"date-time":"2025-03-17T11:43:16Z","timestamp":1742211796000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s41746-025-01475-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,17]]},"references-count":42,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["1475"],"URL":"https:\/\/doi.org\/10.1038\/s41746-025-01475-8","relation":{},"ISSN":["2398-6352"],"issn-type":[{"value":"2398-6352","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3,17]]},"assertion":[{"value":"12 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"163"}}