{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:38Z","timestamp":1750309538627,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714535","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:57:28Z","timestamp":1745362648000},"page":"5330-5341","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["How much Medical Knowledge do LLMs have? An Evaluation of Medical Knowledge Coverage for LLMs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2968-7304","authenticated-orcid":false,"given":"Ziheng","family":"Zhang","sequence":"first","affiliation":[{"name":"Jarvis Research Center, Tencent YouTu Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1264-6549","authenticated-orcid":false,"given":"Zhenxi","family":"Lin","sequence":"additional","affiliation":[{"name":"Jarvis Research Center, Tencent YouTu Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2195-2847","authenticated-orcid":false,"given":"Yefeng","family":"Zheng","sequence":"additional","affiliation":[{"name":"Medical Artificial Intelligence Lab, Westlake University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1118-9710","authenticated-orcid":false,"given":"Xian","family":"Wu","sequence":"additional","affiliation":[{"name":"Jarvis Research Center, Tencent YouTu Lab, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Zhijie Bao Wei Chen Shengze Xiao Kuang Ren Jiaao Wu Cheng Zhong Jiajie Peng Xuanjing Huang and Zhongyu Wei. 2023. DISC-MedLLM: Bridging General Large Language Models and Real-World Medical Consultation. arxiv: 2308.14346 [cs.CL]"},{"key":"e_1_3_2_1_2_1","volume-title":"Lisa Soleymani Lehmann, et al","author":"Bedi Suhana","year":"2024","unstructured":"Suhana Bedi, Yutong Liu, Lucy Orr-Ewing, Dev Dash, Sanmi Koyejo, Alison Callahan, Jason A Fries, Michael Wornow, Akshay Swaminathan, Lisa Soleymani Lehmann, et al. 2024. Testing and Evaluation of Health Care Applications of Large Language Models: A Systematic Review. JAMA (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Yan Cai Linlin Wang Ye Wang Gerard de Melo Ya Zhang Yanfeng Wang and Liang He. 2023. MedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large Language Models. arxiv: 2312.12806 [cs.CL] https:\/\/arxiv.org\/abs\/2312.12806"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10916-023-01925-4"},{"key":"e_1_3_2_1_5_1","unstructured":"Junying Chen Xidong Wang Anningzhe Gao Feng Jiang Shunian Chen Hongbo Zhang Dingjie Song Wenya Xie Chuyi Kong Jianquan Li Xiang Wan Haizhou Li and Benyou Wang. 2023a. HuatuoGPT-II One-stage Training for Medical Adaption of LLMs. arxiv: 2311.09774 [cs.CL]"},{"key":"e_1_3_2_1_6_1","unstructured":"Shiqi Chen Yiran Zhao Jinghan Zhang I-Chun Chern Siyang Gao Pengfei Liu and Junxian He. 2023b. FELM: Benchmarking Factuality Evaluation of Large Language Models. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=jSO7Vgolc6"},{"key":"e_1_3_2_1_7_1","volume-title":"A Review: Knowledge Reasoning over Knowledge Graph. Expert systems with applications","author":"Chen Xiaojun","year":"2020","unstructured":"Xiaojun Chen, Shengbin Jia, and Yang Xiang. 2020. A Review: Knowledge Reasoning over Knowledge Graph. Expert systems with applications, Vol. 141 (2020), 112948."},{"key":"e_1_3_2_1_8_1","volume-title":"DialogSum: A real-life scenario dialogue summarization dataset. arXiv preprint arXiv:2105.06762","author":"Chen Yulong","year":"2021","unstructured":"Yulong Chen, Yang Liu, Liang Chen, and Yue Zhang. 2021. DialogSum: A real-life scenario dialogue summarization dataset. arXiv preprint arXiv:2105.06762 (2021)."},{"key":"e_1_3_2_1_9_1","first-page":"1","article-title":"Preliminary study on the construction of Chinese medical knowledge graph","volume":"33","author":"Dema Ao","year":"2019","unstructured":"Ao Dema, Yang Yunfei, Sui Zhizfang, Dai DaMai, Chang BaoBao, Li SuJian, and Zan HongYing. 2019. Preliminary study on the construction of Chinese medical knowledge graph. Journal of Chinese Information Processing, Vol. 33, 10 (2019), 1--7.","journal-title":"Journal of Chinese Information Processing"},{"key":"e_1_3_2_1_10_1","volume-title":"SciKnowEval: Evaluating Multi-level Scientific Knowledge of Large Language Models. arXiv preprint arXiv:2406.09098","author":"Feng Kehua","year":"2024","unstructured":"Kehua Feng, Keyan Ding, Weijie Wang, Xiang Zhuang, Zeyuan Wang, Ming Qin, Yu Zhao, Jianhua Yao, Qiang Zhang, and Huajun Chen. 2024. SciKnowEval: Evaluating Multi-level Scientific Knowledge of Large Language Models. arXiv preprint arXiv:2406.09098 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Tool Calling: Enhancing Medication Consultation via Retrieval-Augmented Large Language Models. arxiv: 2404.17897 [cs.CL] https:\/\/arxiv.org\/abs\/2404.17897","author":"Huang Zhongzhen","year":"2024","unstructured":"Zhongzhen Huang, Kui Xue, Yongqi Fan, Linjie Mu, Ruoyu Liu, Tong Ruan, Shaoting Zhang, and Xiaofan Zhang. 2024. Tool Calling: Enhancing Medication Consultation via Retrieval-Augmented Large Language Models. arxiv: 2404.17897 [cs.CL] https:\/\/arxiv.org\/abs\/2404.17897"},{"key":"e_1_3_2_1_12_1","volume-title":"Chun Peng, and Jimmy Xiangji Huang.","author":"Jahan Israt","year":"2024","unstructured":"Israt Jahan, Md Tahmid Rahman Laskar, Chun Peng, and Jimmy Xiangji Huang. 2024. A comprehensive evaluation of large language models on benchmark biomedical text processing tasks. Computers in biology and medicine, Vol. 171 (2024), 108189."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2023.105894"},{"key":"e_1_3_2_1_14_1","volume-title":"Path-based knowledge reasoning with textual semantic information for medical knowledge graph completion. BMC medical informatics and decision making","author":"Lan Yinyu","year":"2021","unstructured":"Yinyu Lan, Shizhu He, Kang Liu, Xiangrong Zeng, Shengping Liu, and Jun Zhao. 2021. Path-based knowledge reasoning with textual semantic information for medical knowledge graph completion. BMC medical informatics and decision making, Vol. 21 (2021), 1--12."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024","author":"Lin Zhenxi","year":"2024","unstructured":"Zhenxi Lin, Ziheng Zhang, Xian Wu, and Yefeng Zheng. 2024. Biomedical Entity Linking as Multiple Choice Question Answering. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024). ELRA and ICCL, Torino, Italia, 2390--2396. https:\/\/aclanthology.org\/2024.lrec-main.214\/"},{"key":"e_1_3_2_1_16_1","unstructured":"Xiaoze Liu Feijie Wu Tianyang Xu Zhuo Chen Yichi Zhang Xiaoqian Wang and Jing Gao. 2024. Evaluating the Factuality of Large Language Models using Large-Scale Knowledge Graphs. arxiv: 2404.00942 [cs.CL] https:\/\/arxiv.org\/abs\/2404.00942"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_18_1","volume-title":"GitHub: https:\/\/github.com\/winninghealth\/WiNGPT2","author":"Research Winning Health AI","year":"2024","unstructured":"Winning Health AI Research. 2023. WiNGPT2. GitHub: https:\/\/github.com\/winninghealth\/WiNGPT2; Hugging Face: https:\/\/huggingface.co\/winninghealth\/WiNGPT2-Llama-3--8B-Chat. Accessed: 2024--12-01."},{"key":"e_1_3_2_1_19_1","volume-title":"Hermina Petric Maretic, and Juba Nait Saada","author":"Sansford Hannah","year":"2024","unstructured":"Hannah Sansford, Nicholas Richardson, Hermina Petric Maretic, and Juba Nait Saada. 2024. Grapheval: A knowledge-graph based llm hallucination evaluation framework. arXiv preprint arXiv:2407.10793 (2024)."},{"key":"e_1_3_2_1_20_1","unstructured":"Karan Singhal Shekoofeh Azizi Tao Tu S. Sara Mahdavi Jason Wei Hyung Won Chung Nathan Scales Ajay Tanwani Heather Cole-Lewis Stephen Pfohl Perry Payne Martin Seneviratne Paul Gamble Chris Kelly Nathaneal Scharli Aakanksha Chowdhery Philip Mansfield Blaise Aguera y Arcas Dale Webster Greg S. Corrado Yossi Matias Katherine Chou Juraj Gottweis Nenad Tomasev Yun Liu Alvin Rajkomar Joelle Barral Christopher Semturs Alan Karthikesalingam and Vivek Natarajan. 2022. Large Language Models Encode Clinical Knowledge. arxiv: 2212.13138 [cs.CL] https:\/\/arxiv.org\/abs\/2212.13138"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-023-02448-8"},{"key":"e_1_3_2_1_22_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.26599\/BDMA.2022.9020021"},{"key":"e_1_3_2_1_25_1","volume-title":"The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=XXaIoJyYs7","author":"Wu Xian","year":"2024","unstructured":"Xian Wu, Yutian Zhao, Yunyan Zhang, Jiageng Wu, Zhihong Zhu, Yingying Zhang, Yi Ouyang, Ziheng Zhang, Huimin WANG, Zhenxi Lin, Jie Yang, Shuang Zhao, and Yefeng Zheng. 2024. MedJourney: Benchmark and Evaluation of Large Language Models over Patient Clinical Journey. In The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=XXaIoJyYs7"},{"key":"e_1_3_2_1_26_1","volume-title":"PULSE: Pretrained and Unified Language Service Engine.","author":"Xiaofan Zhang Shaoting Zhang","year":"2023","unstructured":"Shaoting Zhang Xiaofan Zhang, Kui Xue. 2023. PULSE: Pretrained and Unified Language Service Engine. (2023). https:\/\/github.com\/openmedlab\/PULSE"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679673"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.456"},{"key":"e_1_3_2_1_29_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan Fan Yang Fei Deng Feng Wang Feng Liu Guangwei Ai Guosheng Dong Haizhou Zhao Hang Xu Haoze Sun Hongda Zhang Hui Liu Jiaming Ji Jian Xie JunTao Dai Kun Fang Lei Su Liang Song Lifeng Liu Liyun Ru Luyao Ma Mang Wang Mickel Liu MingAn Lin Nuolan Nie Peidong Guo Ruiyang Sun Tao Zhang Tianpeng Li Tianyu Li Wei Cheng Weipeng Chen Xiangrong Zeng Xiaochuan Wang Xiaoxi Chen Xin Men Xin Yu Xuehai Pan Yanjun Shen Yiding Wang Yiyu Li Youxin Jiang Yuchen Gao Yupeng Zhang Zenan Zhou and Zhiying Wu. 2023. Baichuan 2: Open Large-scale Language Models. arxiv: 2309.10305 [cs.CL] https:\/\/arxiv.org\/abs\/2309.10305"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_32_1","volume-title":"Large language models as reliable knowledge bases? arXiv preprint arXiv:2407.13578","author":"Zheng Danna","year":"2024","unstructured":"Danna Zheng, Mirella Lapata, and Jeff Z Pan. 2024. Large language models as reliable knowledge bases? arXiv preprint arXiv:2407.13578 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Wei Zhu Xiaoling Wang Huanran Zheng Mosha Chen and Buzhou Tang. 2023. PromptCBLUE: A Chinese Prompt Tuning Benchmark for the Medical Domain. arxiv: 2310.14151 [cs.CL] https:\/\/arxiv.org\/abs\/2310.14151","DOI":"10.2139\/ssrn.4685921"},{"key":"e_1_3_2_1_34_1","unstructured":"Yanxu Zhu Jinlin Xiao Yuhang Wang and Jitao Sang. 2024. KG-FPQ: Evaluating Factuality Hallucination in LLMs with Knowledge Graph-based False Premise Questions. arxiv: 2407.05868 [cs.CL] https:\/\/arxiv.org\/abs\/2407.05868"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714535","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714535","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:33Z","timestamp":1750295913000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714535"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":33,"alternative-id":["10.1145\/3696410.3714535","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714535","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}