{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T03:35:58Z","timestamp":1782876958510,"version":"3.54.5"},"reference-count":5,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T00:00:00Z","timestamp":1753660800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T00:00:00Z","timestamp":1753660800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Med Syst"],"abstract":"<jats:title>Abstract<\/jats:title>\n          <jats:p>HealthBench is an open-source, large-scale benchmark consisting of 5,000 multi-turn clinical conversations evaluated against 48,562 criteria developed by clinicians. Recognized as a significant advancement in assessing realistic artificial intelligence (AI) models, HealthBench deserves further exploration. In this article, we systematically analyze the benchmark\u2019s disease spectrum, diagnostic and therapeutic focuses, and demographic diversity. We evaluate its representativeness and strengths, as well as the essential limitations that AI researchers and clinicians should consider when using it for realistic model evaluations.<\/jats:p>","DOI":"10.1007\/s10916-025-02232-w","type":"journal-article","created":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T04:33:54Z","timestamp":1753677234000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Dissecting HealthBench: Disease Spectrum, Clinical Diversity, and Data Insights from Multi-Turn Clinical AI Evaluation Benchmark"],"prefix":"10.1007","volume":"49","author":[{"given":"Jialin","family":"Liu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Siru","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,7,28]]},"reference":[{"issue":"1","key":"2232_CR1","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1007\/s10916-025-02212-0","volume":"49","author":"DA Forero","year":"2025","unstructured":"Forero, D.A, Abreu, S.E, Tovar, B.E, Oermann, M.H. Large Language Models and the Analyses of Adherence to Reporting Guidelines in Systematic Reviews and Overviews of Reviews (PRISMA 2020 and PRIOR). J Med Syst. 2025;49(1):80. https:\/\/doi.org\/10.1007\/s10916-025-02212-0.","journal-title":"J Med Syst"},{"key":"2232_CR2","doi-asserted-by":"publisher","DOI":"10.3390\/app11146421","author":"D Jin","year":"2021","unstructured":"Jin, D., Pan, E., Oufattole, N., Weng, W.-H., Fang, H., Szolovits, P. What Disease Does This Patient Have? A Large-Scale Open Domain Question Answering Dataset from Medical Exams. Appl. Sci. 2021, 11, 6421. https:\/\/doi.org\/10.3390\/app11146421"},{"key":"2232_CR3","doi-asserted-by":"publisher","unstructured":"Jin, Q., Dhingra, B., Liu, Z., Cohen, W. W. & Lu, X. PubMedQA: A Dataset for Biomedical Research Question Answering. arXiv, 2019. https:\/\/doi.org\/10.48550\/arXiv.1909.06146","DOI":"10.48550\/arXiv.1909.06146"},{"key":"2232_CR4","unstructured":"Arora, R. K., Wei, J., Hicks, R. S., Bowman, P., Qui\u00f1onero-Candela, J., Tsimourlas, F, et al. Introducing HealthBench An evaluation for AI systems and human health. 2025. https:\/\/openai.com\/index\/healthbench\/ (accessed 12 May 2025)."},{"key":"2232_CR5","unstructured":"World Health Organization. WHO Model Lists of Essential Medicines (July 2023). Available at: https:\/\/www.who.int\/groups\/expert-committee-on-selection-and-use-of-essential-medicines\/essential-medicines-lists (accessed 12 May 2025)."}],"container-title":["Journal of Medical Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10916-025-02232-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10916-025-02232-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10916-025-02232-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,28]],"date-time":"2025-07-28T04:33:56Z","timestamp":1753677236000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10916-025-02232-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,28]]},"references-count":5,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["2232"],"URL":"https:\/\/doi.org\/10.1007\/s10916-025-02232-w","relation":{"references":[{"id-type":"doi","id":"10.3390\/app11146421","asserted-by":"subject"}]},"ISSN":["1573-689X"],"issn-type":[{"value":"1573-689X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,28]]},"assertion":[{"value":"1 July 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics Approval and Consent to Participate"}},{"value":"The authors declare no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}}],"article-number":"100"}}