{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T14:18:51Z","timestamp":1777299531064,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792076","type":"proceedings-article","created":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:54:34Z","timestamp":1775771674000},"page":"1864-1875","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Med-R\n                    <sup>2<\/sup>\n                    : Crafting Trustworthy LLM Physicians via Retrieval and Reasoning of Evidence-Based Medicine"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5966-8309","authenticated-orcid":false,"given":"Lu","family":"Keer","sequence":"first","affiliation":[{"name":"Peking University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4470-6107","authenticated-orcid":false,"given":"Zheng","family":"Liang","sequence":"additional","affiliation":[{"name":"Baichuan Inc., Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5815-741X","authenticated-orcid":false,"given":"Da","family":"Pan","sequence":"additional","affiliation":[{"name":"Baichuan Inc., Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2107-3152","authenticated-orcid":false,"given":"Shusen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Baichuan Inc., Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2955-9272","authenticated-orcid":false,"given":"Guosheng","family":"Dong","sequence":"additional","affiliation":[{"name":"Baichuan Inc., Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9531-0760","authenticated-orcid":false,"given":"Huang","family":"Leng","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1681-4677","authenticated-orcid":false,"given":"Bin","family":"Cui","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1268-836X","authenticated-orcid":false,"given":"Zhonghai","family":"Wu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7532-5550","authenticated-orcid":false,"given":"Wentao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.2196\/48291"},{"key":"e_1_3_2_1_2_1","unstructured":"Anthropic. 2024. Claude. https:\/\/www.anthropic.com\/Accessed: 2024-06--27."},{"key":"e_1_3_2_1_3_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Asai Akari","year":"2023","unstructured":"Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_4_1","volume-title":"Rawen Kader, Esteban Ortiz-Prado, Marcus R Makowski, Luca Saba, Martin Hadamitzky, Jakob Nikolas Kather, et al.","author":"Busch Felix","year":"2024","unstructured":"Felix Busch, Lena Hoffmann, Christopher Rueger, Elon HC van Dijk, Rawen Kader, Esteban Ortiz-Prado, Marcus R Makowski, Luca Saba, Martin Hadamitzky, Jakob Nikolas Kather, et al. 2024. Systematic Review of Large Language Models for Patient Care: Current Applications and Challenges. medRxiv (2024), 2024--03."},{"key":"e_1_3_2_1_5_1","unstructured":"Susan M Case and David B Swanson. 1998. Constructing written test questions for the basic and clinical sciences. National Board of Medical Examiners Philadelphia."},{"key":"e_1_3_2_1_6_1","volume-title":"Angelika Romanou, Antoine Bonnet","author":"Chen Zeming","year":"2023","unstructured":"Zeming Chen, Alejandro Hern\u00e1ndez Cano, Angelika Romanou, Antoine Bonnet, Kyle Matoba, Francesco Salvi, Matteo Pagliardini, Simin Fan, Andreas K\u00f6pf, Amirkeivan Mohtashami, et al. 2023. Meditron-70b: Scaling medical pretraining for large language models. arXiv preprint arXiv:2311.16079 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Zunamys I Carrero, Jan-Niklas Eckardt, Narmin Ghaffari Laleh, Chiara Maria Lavinia L\u00f6ffler, Sophie-Caroline Schwarzkopf, Michaela Unger, Gregory P Veldhuizen, et al.","author":"Clusmann Jan","year":"2023","unstructured":"Jan Clusmann, Fiona R Kolbinger, Hannah Sophie Muti, Zunamys I Carrero, Jan-Niklas Eckardt, Narmin Ghaffari Laleh, Chiara Maria Lavinia L\u00f6ffler, Sophie-Caroline Schwarzkopf, Michaela Unger, Gregory P Veldhuizen, et al. 2023. The future landscape of large language models in medicine. Communications medicine 3, 1 (2023), 141."},{"key":"e_1_3_2_1_8_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997","author":"Gao Yunfan","year":"2023","unstructured":"Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, and Haofen Wang. 2023. Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1001\/jama.1992.03490170092032"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1001\/jama.284.10.1290"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Paul Hager Friederike Jungmann Robbie Holland Kunal Bhagat Inga Hubrecht Manuel Knauer Jakob Vielhauer Marcus Makowski Rickmer Braren Georgios Kaissis et al. 2024. Evaluation and mitigation of the limitations of large language models in clinical decision-making. Nature medicine 30 9 (2024) 2613--2622.","DOI":"10.1038\/s41591-024-03097-1"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 15th Workshop on Biomedical Natural Language Processing, Kevin Bretonnel Cohen, Dina Demner-Fushman, Sophia Ananiadou, and Jun-ichi Tsujii (Eds.)","author":"Hakala Kai","unstructured":"Kai Hakala, Suwisa Kaewphan, Tapio Salakoski, and Filip Ginter. 2016. Syntactic analyses and named entity recognition for PubMed and PubMed Central \u2014 up-to-the-minute. In Proceedings of the 15th Workshop on Biomedical Natural Language Processing, Kevin Bretonnel Cohen, Dina Demner-Fushman, Sophia Ananiadou, and Jun-ichi Tsujii (Eds.). Association for Computational Linguistics, Berlin, Germany, 102--107."},{"key":"e_1_3_2_1_14_1","volume-title":"Rethinking with retrieval: Faithful large language model inference. arXiv preprint arXiv:2301.00303","author":"He Hangfeng","year":"2022","unstructured":"Hangfeng He, Hongming Zhang, and Dan Roth. 2022. Rethinking with retrieval: Faithful large language model inference. arXiv preprint arXiv:2301.00303 (2022)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2021. Measuring Massive Multitask Language Understanding. Proceedings of the International Conference on Learning Representations (ICLR) (2021)."},{"key":"e_1_3_2_1_16_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_17_1","first-page":"1","article-title":"Atlas: Few-shot learning with retrieval augmented language models","volume":"24","author":"Izacard Gautier","year":"2023","unstructured":"Gautier Izacard, Patrick Lewis, Maria Lomeli, Lucas Hosseini, Fabio Petroni, Timo Schick, Jane Dwivedi-Yu, Armand Joulin, Sebastian Riedel, and Edouard Grave. 2023. Atlas: Few-shot learning with retrieval augmented language models. Journal of Machine Learning Research 24, 251 (2023), 1--43.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btae238"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.389"},{"key":"e_1_3_2_1_20_1","volume-title":"What Disease does this Patient Have? A Large-scale Open Domain Question Answering Dataset from Medical Exams. arXiv preprint arXiv:2009.13081","author":"Jin Di","year":"2020","unstructured":"Di Jin, Eileen Pan, Nassim Oufattole, Wei-Hung Weng, Hanyi Fang, and Peter Szolovits. 2020. What Disease does this Patient Have? A Large-scale Open Domain Question Answering Dataset from Medical Exams. arXiv preprint arXiv:2009.13081 (2020)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1259"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1681\/ASN.0000000000000166"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Machine Learning. PMLR, 15696--15707","author":"Kandpal Nikhil","year":"2023","unstructured":"Nikhil Kandpal, Haikang Deng, Adam Roberts, Eric Wallace, and Colin Raffel. 2023. Large language models struggle to learn long-tail knowledge. In International Conference on Machine Learning. PMLR, 15696--15707."},{"key":"e_1_3_2_1_25_1","volume-title":"GPT versus resident physicians\u2014a benchmark based on official board scores. Nejm Ai 1, 5","author":"Katz Uriel","year":"2024","unstructured":"Uriel Katz, Eran Cohen, Eliya Shachar, Jonathan Somer, Adam Fink, Eli Morse, Beki Shreiber, and Ido Wolf. 2024. GPT versus resident physicians\u2014a benchmark based on official board scores. Nejm Ai 1, 5 (2024), AIdbp2300192."},{"key":"e_1_3_2_1_26_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Kim Yubin","year":"2024","unstructured":"Yubin Kim, Chanwoo Park, Hyewon Jeong, Yik Siu Chan, Xuhai Xu, Daniel McDuff, Hyeonhoon Lee, Marzyeh Ghassemi, Cynthia Breazeal, and Hae Won Park. 2024. Mdagents: An adaptive collaboration of llms for medical decision-making. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_27_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_28_1","volume-title":"Chatdoctor: A medical chat model fine-tuned on a large language model meta-ai (llama) using medical domain knowledge. Cureus 15, 6","author":"Li Yunxiang","year":"2023","unstructured":"Yunxiang Li, Zihan Li, Kai Zhang, Ruilong Dan, Steve Jiang, and You Zhang. 2023. Chatdoctor: A medical chat model fine-tuned on a large language model meta-ai (llama) using medical domain knowledge. Cureus 15, 6 (2023)."},{"key":"e_1_3_2_1_29_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"Bailicai: A Domain-Optimized Retrieval-Augmented Generation Framework for Medical Applications. arXiv preprint arXiv:2407.21055","author":"Long Cui","year":"2024","unstructured":"Cui Long, Yongbin Liu, Chunping Ouyang, and Ying Yu. 2024. Bailicai: A Domain-Optimized Retrieval-Augmented Generation Framework for Medical Applications. arXiv preprint arXiv:2407.21055 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"BioGPT: generative pre-trained transformer for biomedical text generation and mining. Briefings in bioinformatics 23, 6","author":"Luo Renqian","year":"2022","unstructured":"Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon, and Tie-Yan Liu. 2022. BioGPT: generative pre-trained transformer for biomedical text generation and mining. Briefings in bioinformatics 23, 6 (2022), bbac409."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.36922\/aih.2558"},{"key":"e_1_3_2_1_33_1","volume-title":"Conference on health, inference, and learning. PMLR, 248--260","author":"Pal Ankit","year":"2022","unstructured":"Ankit Pal, Logesh Kumar Umapathi, and Malaikannan Sankarasubbu. 2022. Medmcqa: A large-scale multi-subject multi-choice dataset for medical domain question answering. In Conference on health, inference, and learning. PMLR, 248--260."},{"key":"e_1_3_2_1_34_1","volume-title":"Seminars in perinatology","author":"Sackett David L","unstructured":"David L Sackett. 1997. Evidence-based medicine. In Seminars in perinatology, Vol. 21. Elsevier, 3--5."},{"key":"e_1_3_2_1_35_1","volume-title":"JA Muir Gray, R Brian Haynes, and W Scott Richardson.","author":"Sackett David L","year":"1996","unstructured":"David L Sackett, William MC Rosenberg, JA Muir Gray, R Brian Haynes, and W Scott Richardson. 1996. Evidence based medicine: what it is and what it isn't. 71--72 pages."},{"key":"e_1_3_2_1_36_1","volume-title":"Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, et al.","author":"Singhal Karan","year":"2023","unstructured":"Karan Singhal, Shekoofeh Azizi, Tao Tu, S Sara Mahdavi, Jason Wei, Hyung Won Chung, Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, et al. 2023. Large language models encode clinical knowledge. Nature 620, 7972 (2023), 172--180."},{"key":"e_1_3_2_1_37_1","unstructured":"Karan Singhal Tao Tu Juraj Gottweis Rory Sayres Ellery Wulczyn Le Hou Kevin Clark Stephen Pfohl Heather Cole-Lewis Darlene Neal et al. 2023. Towards expert-level medical question answering with large language models. arXiv preprint arXiv:2305.09617 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Kabilan Elangovan, Laura Gutierrez, Ting Fang Tan, and Daniel Shu Wei Ting.","author":"Thirunavukarasu Arun James","year":"2023","unstructured":"Arun James Thirunavukarasu, Darren Shu Jeng Ting, Kabilan Elangovan, Laura Gutierrez, Ting Fang Tan, and Daniel Shu Wei Ting. 2023. Large language models in medicine. Nature medicine 29, 8 (2023), 1930--1940."},{"key":"e_1_3_2_1_39_1","unstructured":"THUMedInfo. 2025. RareArena: A Dataset for Rare Disease Information Retrieval. https:\/\/huggingface.co\/datasets\/THUMedInfo\/RareArena"},{"key":"e_1_3_2_1_40_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1093\/jamia\/ocad258"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.95"},{"key":"e_1_3_2_1_43_1","volume-title":"PMC-LLaMA: toward building open-source language models for medicine. Journal of the American Medical Informatics Association","author":"Wu Chaoyi","year":"2024","unstructured":"Chaoyi Wu, Weixiong Lin, Xiaoman Zhang, Ya Zhang, Weidi Xie, and Yanfeng Wang. 2024. PMC-LLaMA: toward building open-source language models for medicine. Journal of the American Medical Informatics Association (2024), ocae045."},{"key":"e_1_3_2_1_44_1","volume-title":"Sheared llama: Accelerating language model pre-training via structured pruning. arXiv preprint arXiv:2310.06694","author":"Xia Mengzhou","year":"2023","unstructured":"Mengzhou Xia, Tianyu Gao, Zhiyuan Zeng, and Danqi Chen. 2023. Sheared llama: Accelerating language model pre-training via structured pruning. arXiv preprint arXiv:2310.06694 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.372"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Guangzhi Xiong Qiao Jin Xiao Wang Minjia Zhang Zhiyong Lu and Aidong Zhang. 2024. Improving retrieval-augmented generation in medicine with iterative follow-up questions. In Biocomputing 2025: Proceedings of the Pacific Symposium. World Scientific 199--214.","DOI":"10.1142\/9789819807024_0015"},{"key":"e_1_3_2_1_47_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024. Qwen2. 5 Technical Report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1002\/hcs2.61"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Cyril Zakka Rohan Shad Akash Chaurasia Alex R Dalal Jennifer L Kim Michael Moor Robyn Fong Curran Phillips Kevin Alexander Euan Ashley et al. 2024. Almanac\u2014retrieval-augmented language models for clinical medicine. NEJM AI 1 2 (2024) AIoa2300068.","DOI":"10.1056\/AIoa2300068"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.743"},{"key":"e_1_3_2_1_51_1","volume-title":"Retrieval-augmented generation for ai-generated content: A survey. arXiv preprint arXiv:2402.19473","author":"Zhao Penghao","year":"2024","unstructured":"Penghao Zhao, Hailin Zhang, Qinhan Yu, Zhengren Wang, Yunteng Geng, Fangcheng Fu, Ling Yang, Wentao Zhang, and Bin Cui. 2024. Retrieval-augmented generation for ai-generated content: A survey. arXiv preprint arXiv:2402.19473 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding. arXiv preprint arXiv:2501.18362","author":"Zuo Yuxin","year":"2025","unstructured":"Yuxin Zuo, Shang Qu, Yifei Li, Zhangren Chen, Xuekai Zhu, Ermo Hua, Kaiyan Zhang, Ning Ding, and Bowen Zhou. 2025. MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding. arXiv preprint arXiv:2501.18362 (2025)."}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3774904.3792076","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T13:30:07Z","timestamp":1777296607000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792076"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":52,"alternative-id":["10.1145\/3774904.3792076","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792076","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}