{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:57:40Z","timestamp":1776931060800,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":118,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62372298"],"award-info":[{"award-number":["62372298"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3772318.3790835","type":"proceedings-article","created":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T04:12:36Z","timestamp":1776053556000},"page":"1-31","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["\u201cDo I Trust the AI?\u201d Towards Trustworthy AI-Assisted Diagnosis: Understanding User Perception in LLM-Supported Clinical Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-1630-6279","authenticated-orcid":false,"given":"Yuansong","family":"Xu","sequence":"first","affiliation":[{"name":"School of Information Science and Technology, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4606-1886","authenticated-orcid":false,"given":"Yichao","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, ShanghaiTech University, Shanghai, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9743-907X","authenticated-orcid":false,"given":"Haokai","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8333-4405","authenticated-orcid":false,"given":"Yuchen","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5841-7659","authenticated-orcid":false,"given":"Yang","family":"Ouyang","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9926-9857","authenticated-orcid":false,"given":"Hanlu","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Clinical Research and Trials Center, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8094-7546","authenticated-orcid":false,"given":"Wenzhe","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shanghai Clinical Research and Trials Center, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6566-3647","authenticated-orcid":false,"given":"Xinyu","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Clinical Research and Trials Center, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7468-3372","authenticated-orcid":false,"given":"Chang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Shanghai Clinical Research and Trials Center, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2249-0728","authenticated-orcid":false,"given":"Quan","family":"Li","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, ShanghaiTech University, Shanghai, Shanghai, China and Yibandao (Suzhou) Intelligent Technology Co., Ltd., Suzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,13]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Andr\u00e9 Altmann Laura Tolo\u015fi Oliver Sander and Thomas Lengauer. 2010. Permutation importance: a corrected feature importance measure. Bioinformatics 26 10 (2010) 1340\u20131347.","DOI":"10.1093\/bioinformatics\/btq134"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Mauro Annarumma Samuel\u00a0J Withey Robert\u00a0J Bakewell Emanuele Pesce Vicky Goh and Giovanni Montana. 2019. Automated triaging of adult chest radiographs with deep artificial neural networks. Radiology 291 1 (2019) 196\u2013202.","DOI":"10.1148\/radiol.2018180921"},{"key":"e_1_3_3_2_5_2","unstructured":"Anthropic. 2024. Claude 3 Model Family. https:\/\/www.anthropic.com\/news\/claude-3-family. Accessed: 2025-06-27."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Onur Asan Alparslan\u00a0Emrah Bayrak and Avishek Choudhury. 2020. Artificial intelligence and human trust in healthcare: focus on clinicians. Journal of medical Internet research 22 6 (2020) e15154.","DOI":"10.2196\/15154"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Raghav Awasthi Shreya Mishra Dwarikanath Mahapatra Ashish Khanna Kamal Maheshwari Jacek Cywinski Frank Papay and Piyush Mathur. 2023. HumanELY: Human evaluation of LLM yield using a novel web-based evaluation tool. MedRXIV (2023) 2023\u201312.","DOI":"10.1101\/2023.12.22.23300458"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445717"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3604915.3608857"},{"key":"e_1_3_3_2_10_2","unstructured":"HS Barrows GR Norman VR Neufeld and JW7116714 Feightner. 1982. The clinical reasoning of randomly selected physicians in general medical practice. Clinical and Investigative Medicine. Medecine clinique et experimentale 5 1 (1982) 49\u201355."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3714097"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Judith\u00a0L Bowen. 2006. Educational strategies to promote clinical diagnostic reasoning. New England Journal of Medicine 355 21 (2006) 2217\u20132225.","DOI":"10.1056\/NEJMra054782"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Ralph\u00a0Allan Bradley and Milton\u00a0E Terry. 1952. Rank analysis of incomplete block designs: I. the method of paired comparisons. Biometrika 39 3\/4 (1952) 324\u2013345.","DOI":"10.1093\/biomet\/39.3-4.324"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Virginia Braun and Victoria Clarke. 2006. Using thematic analysis in psychology. Qualitative research in psychology 3 2 (2006) 77\u2013101.","DOI":"10.1191\/1478088706qp063oa"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Zana Bu\u00e7inca Maja\u00a0Barbara Malaya and Krzysztof\u00a0Z Gajos. 2021. To trust or to think: cognitive forcing functions can reduce overreliance on AI in AI-assisted decision-making. Proceedings of the ACM on Human-computer Interaction 5 CSCW1 (2021) 1\u201321.","DOI":"10.1145\/3449287"},{"key":"e_1_3_3_2_16_2","unstructured":"Kai Chen Xinfeng Li Tianpei Yang Hewei Wang Wei Dong and Yang Gao. 2025. Mdteamgpt: A self-evolving llm-based multi-agent framework for multi-disciplinary team medical consultation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.13856 (2025)."},{"key":"e_1_3_3_2_17_2","unstructured":"Zeming Chen Alejandro Hern\u00e1ndez-Cano Angelika Romanou Antoine Bonnet Kyle Matoba Francesco Salvi Matteo Pagliardini Simin Fan Andreas K\u00f6pf Amirkeivan Mohtashami Alexandre Sallinen Alireza Sakhaeirad Vinitra Swamy Igor Krawczuk Deniz Bayazit Axel Marmet Syrielle Montariol Mary-Anne Hartley Martin Jaggi and Antoine Bosselut. 2023. MEDITRON-70B: Scaling Medical Pretraining for Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2311.16079\u00a0[cs.CL]"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Ryan Chin\u00a0Taw Cheong Samit Unadkat Venkata McNeillis Andrew Williamson Jonathan Joseph Premjit Randhawa Peter Andrews and Vinidh Paleri. 2024. Artificial intelligence chatbots as sources of patient education material for obstructive sleep apnoea: ChatGPT versus Google Bard. European Archives of Oto-Rhino-Laryngology 281 2 (2024) 985\u2013993.","DOI":"10.1007\/s00405-023-08319-9"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Joongwon Choi Jin\u00a0Wook Kim Yong\u00a0Seong Lee Jong\u00a0Hyun Tae Se\u00a0Young Choi In\u00a0Ho Chang and Jung\u00a0Hoon Kim. 2024. Availability of ChatGPT to provide medical information for patients with kidney cancer. Scientific Reports 14 1 (2024) 1542.","DOI":"10.1038\/s41598-024-51531-8"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Avishek Choudhury and Zaira Chaudhry. 2024. Large language models and user trust: consequence of self-referential learning loop and the deskilling of health care professionals. Journal of Medical Internet Research 26 (2024) e56764.","DOI":"10.2196\/56764"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"William\u00a0S Cleveland. 1979. Robust locally weighted regression and smoothing scatterplots. Journal of the American statistical association 74 368 (1979) 829\u2013836.","DOI":"10.1080\/01621459.1979.10481038"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Gregory\u00a0W Corder and Dale\u00a0I Foreman. 2009. Nonparametric statistics for non-statisticians. (No Title) (2009).","DOI":"10.1002\/9781118165881"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747746"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Lee\u00a0J Cronbach. 1951. Coefficient alpha and the internal structure of tests. psychometrika 16 3 (1951) 297\u2013334.","DOI":"10.1007\/BF02310555"},{"key":"e_1_3_3_2_25_2","unstructured":"Edoardo\u00a0Sebastiano De\u00a0Duro Giuseppe\u00a0Alessandro Veltri Hudson Golino and Massimo Stella. 2025. Measuring and identifying factors of individuals\u2019 trust in Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.21028 (2025)."},{"key":"e_1_3_3_2_26_2","unstructured":"Google DeepMind. 2024. Gemini 1.5 Technical Report. https:\/\/deepmind.google\/technologies\/gemini. Accessed: 2025-06-27."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706599.3719675"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.4159\/harvard.9780674189089"},{"key":"e_1_3_3_2_29_2","unstructured":"Reinhard\u00a0Friedrich Fritsch and Adam Jatowt. 2024. LLMTemporalComparator: A Tool for Analysing Differences in Temporal Adaptations of Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.04195 (2024)."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706599.3720105"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-99740-7_21"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Ethan Goh Robert Gallo Jason Hom Eric Strong Yingjie Weng Hannah Kerman Jos\u00e9phine\u00a0A Cool Zahir Kanjee Andrew\u00a0S Parsons Neera Ahuja et\u00a0al. 2024. Large language model influence on diagnostic reasoning: a randomized clinical trial. JAMA Network Open 7 10 (2024) e2440969\u2013e2440969.","DOI":"10.1001\/jamanetworkopen.2024.40969"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Catalina Gomez Sue\u00a0Min Cho Shichang Ke Chien-Ming Huang and Mathias Unberath. 2025. Human-AI collaboration is not very collaborative yet: A taxonomy of interaction patterns in AI-assisted decision making from a systematic review. Frontiers in Computer Science 6 (2025) 1521066.","DOI":"10.3389\/fcomp.2024.1521066"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"Hongyan Gu Jingbin Huang Lauren Hung and Xiang\u2019Anthony\u2019 Chen. 2021. Lessons learned from designing an AI-enabled diagnosis tool for pathologists. Proceedings of the ACM on Human-computer Interaction 5 CSCW1 (2021) 1\u201325.","DOI":"10.1145\/3449084"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580694"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642353"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3678884.3681841"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/2516540.2516554"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491101.3503564"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Di Jin Eileen Pan Nassim Oufattole Wei-Hung Weng Hanyi Fang and Peter Szolovits. 2021. What disease does this patient have? a large-scale open domain question answering dataset from medical exams. Applied Sciences 11 14 (2021) 6421.","DOI":"10.3390\/app11146421"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Qiao Jin Fangyuan Chen Yiliang Zhou Ziyang Xu Justin\u00a0M Cheung Robert Chen Ronald\u00a0M Summers Justin\u00a0F Rousseau Peiyun Ni Marc\u00a0J Landsman et\u00a0al. 2024. Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in medicine. NPJ Digital Medicine 7 1 (2024) 190.","DOI":"10.1038\/s41746-024-01185-7"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Qiao Jin Bhuwan Dhingra Zhengping Liu William\u00a0W Cohen and Xinghua Lu. 2019. Pubmedqa: A dataset for biomedical research question answering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.06146 (2019).","DOI":"10.18653\/v1\/D19-1259"},{"key":"e_1_3_3_2_43_2","unstructured":"Daniel Kahneman. 2011. Fast and slow thinking. Allen Lane and Penguin Books New York (2011)."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3650755"},{"key":"e_1_3_3_2_45_2","unstructured":"Jerome\u00a0P Kassirer John\u00a0B Wong and Richard\u00a0I Kopelman. 1991. Learning clinical reasoning. (1991)."},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Mohsen Khosravi Zahra Zare Seyyed\u00a0Morteza Mojtabaeian and Reyhane Izadi. 2024. Artificial intelligence and decision-making in healthcare: a thematic analysis of a systematic review of reviews. Health services research and managerial epidemiology 11 (2024) 23333928241234863.","DOI":"10.1177\/23333928241234863"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3630106.3658941"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642216"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Alisa K\u00fcper Georg\u00a0Christian Lodde Elisabeth Livingstone Dirk Schadendorf and Nicole Kr\u00e4mer. 2025. Psychological factors influencing appropriate reliance on ai-enabled clinical decision support systems: experimental web-based study among dermatologists. Journal of Medical Internet Research 27 (2025) e58660.","DOI":"10.2196\/58660"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3714113"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","unstructured":"Ro\u00a0Woon Lee Kyu\u00a0Hong Lee Jae\u00a0Sung Yun Myung\u00a0Sub Kim and Hyun\u00a0Seok Choi. 2024. Comparative analysis of M4CXR an LLM-Based chest X-Ray report generation model and ChatGPT in radiological interpretation. Journal of Clinical Medicine 13 23 (2024) 7057.","DOI":"10.3390\/jcm13237057"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3708359.3712136"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.138"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713133"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3714196"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645166"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.548"},{"key":"e_1_3_3_2_58_2","unstructured":"Yang Liu Yuanshun Yao Jean-Francois Ton Xiaoying Zhang Ruocheng Guo Hao Cheng Yegor Klochkov Muhammad\u00a0Faaiz Taufiq and Hang Li. 2023. Trustworthy llms: a survey and guideline for evaluating large language models\u2019 alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.05374 (2023)."},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581058"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"crossref","unstructured":"Subhankar Maity and Manob\u00a0Jyoti Saikia. 2025. Large Language Models in Healthcare and Medical Applications: A Review. Bioengineering 12 6 (2025) 631.","DOI":"10.3390\/bioengineering12060631"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"crossref","unstructured":"John\u00a0M McGuirl and Nadine\u00a0B Sarter. 2006. Supporting trust calibration and the effective use of decision aids by presenting dynamic system confidence information. Human factors 48 4 (2006) 656\u2013665.","DOI":"10.1518\/001872006779166334"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","unstructured":"Andrew\u00a0J. McMurry Dylan Phelan Brian\u00a0E. Dixon Alon Geva Daniel Gottlieb James\u00a0R. Jones Michael Terry David Taylor Hannah\u00a0Grace Callaway Sneha Mahoharan Timothy Miller and Kenneth\u00a0D. Mandl. 2024. Large Language Model Symptom Identification from Clinical Text: A Multi-Center Study. medRxiv (2024). arXiv:https:\/\/www.medrxiv.org\/content\/early\/2024\/12\/17\/2024.12.16.24319044.full.pdf10.1101\/2024.12.16.24319044","DOI":"10.1101\/2024.12.16.24319044"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642122"},{"key":"e_1_3_3_2_64_2","unstructured":"Microsoft. 2025. Microsoft Dragon Copilot. https:\/\/www.microsoft.com\/en-us\/health-solutions\/clinical-workflow\/dragon-copilot Accessed: 2025-09-11."},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"crossref","unstructured":"Aditi Mishra Bretho Danzy Utkarsh Soni Anjana Arunkumar Jinbin Huang Bum\u00a0Chul Kwon and Chris Bryan. 2025. PromptAid: Visual prompt exploration perturbation testing and iteration for large language models. IEEE Transactions on Visualization and Computer Graphics (2025).","DOI":"10.1109\/TVCG.2025.3535332"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"crossref","unstructured":"Maria\u00a0D Molina and S\u00a0Shyam Sundar. 2022. When AI moderates online content: effects of human collaboration and interactive transparency on user trust. Journal of Computer-Mediated Communication 27 4 (2022) zmac010.","DOI":"10.1093\/jcmc\/zmac010"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713744"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"crossref","unstructured":"Mahmud Omar Girish\u00a0N Nadkarni Eyal Klang and Benjamin\u00a0S Glicksberg. 2024. Large language models in medicine: A review of current clinical trials across healthcare applications. PLOS Digital Health 3 11 (2024) e0000662.","DOI":"10.1371\/journal.pdig.0000662"},{"key":"e_1_3_3_2_69_2","first-page":"248","volume-title":"Conference on health, inference, and learning","author":"Pal Ankit","year":"2022","unstructured":"Ankit Pal, Logesh\u00a0Kumar Umapathi, and Malaikannan Sankarasubbu. 2022. Medmcqa: A large-scale multi-subject multi-choice dataset for medical domain question answering. In Conference on health, inference, and learning. PMLR, 248\u2013260."},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"publisher","unstructured":"Oleksandr Palagin Vladislav Kaverinskiy Anna Litvin and Kyrylo Malakhov. 2023. OntoChatGPT Information System: Ontology-Driven Structured Prompts for ChatGPT Meta-Learning. International Journal of Computing (July 2023) 170\u2013183. 10.47839\/ijc.22.2.3086","DOI":"10.47839\/ijc.22.2.3086"},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"crossref","unstructured":"Cecilia Panigutti Andrea Beretta Daniele Fadda Fosca Giannotti Dino Pedreschi Alan Perotti and Salvatore Rinzivillo. 2023. Co-design of human-centered explainable AI for clinical decision support. ACM Transactions on Interactive Intelligent Systems 13 4 (2023) 1\u201335.","DOI":"10.1145\/3587271"},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"crossref","unstructured":"Karl Pearson. 1895. VII. Note on regression and inheritance in the case of two parents. proceedings of the royal society of London 58 347-352 (1895) 240\u2013242.","DOI":"10.1098\/rspl.1895.0041"},{"key":"e_1_3_3_2_73_2","doi-asserted-by":"crossref","unstructured":"Dulce\u00a0G Pereira Anabela Afonso and F\u00e1tima\u00a0Melo Medeiros. 2015. Overview of Friedman\u2019s test and post-hoc analysis. Communications in Statistics-Simulation and Computation 44 10 (2015) 2636\u20132653.","DOI":"10.1080\/03610918.2014.931971"},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"crossref","unstructured":"Jeremy Qin Bang Liu and Quoc\u00a0Dinh Nguyen. 2024. Enhancing Healthcare LLM Trust with Atypical Presentations Recalibration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.03225 (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.142"},{"key":"e_1_3_3_2_75_2","unstructured":"Pengcheng Qiu Chaoyi Wu Shuyu Liu Weike Zhao Zhuoxia Chen Hongfei Gu Chuanjin Peng Ya Zhang Yanfeng Wang and Weidi Xie. 2025. Quantifying the reasoning abilities of llms on real-world clinical cases. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.04691 (2025)."},{"key":"e_1_3_3_2_76_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600211.3604712"},{"key":"e_1_3_3_2_77_2","doi-asserted-by":"crossref","unstructured":"Charvi Rastogi Yunfeng Zhang Dennis Wei Kush\u00a0R Varshney Amit Dhurandhar and Richard Tomsett. 2022. Deciding fast and slow: The role of cognitive biases in ai-assisted decision-making. Proceedings of the ACM on Human-computer Interaction 6 CSCW1 (2022) 1\u201322.","DOI":"10.1145\/3512930"},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501967"},{"key":"e_1_3_3_2_79_2","doi-asserted-by":"publisher","DOI":"10.1145\/3719160.3736616"},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642362"},{"key":"e_1_3_3_2_81_2","unstructured":"Burcu Sayin Ipek\u00a0Baris Schlicht Ngoc\u00a0Vo Hong Sara Allievi Jacopo Staiano Pasquale Minervini and Andrea Passerini. 2025. MedSyn: Enhancing Diagnostics with Human-AI Collaboration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.14774 (2025)."},{"key":"e_1_3_3_2_82_2","doi-asserted-by":"crossref","unstructured":"Nadine Schlicker Kevin Baum Alarith Uhde Sarah Sterz Martin\u00a0C Hirsch and Markus Langer. 2025. How do we assess the trustworthiness of AI? Introducing the trustworthiness assessment model (TrAM). Computers in Human Behavior 170 (2025) 108671.","DOI":"10.1016\/j.chb.2025.108671"},{"key":"e_1_3_3_2_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676450"},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/3584931.3607492"},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"crossref","unstructured":"Karan Singhal Shekoofeh Azizi Tao Tu S\u00a0Sara Mahdavi Jason Wei Hyung\u00a0Won Chung Nathan Scales Ajay Tanwani Heather Cole-Lewis Stephen Pfohl et\u00a0al. 2023. Large language models encode clinical knowledge. Nature 620 7972 (2023) 172\u2013180.","DOI":"10.1038\/s41586-023-06291-2"},{"key":"e_1_3_3_2_86_2","unstructured":"Karan Singhal Tao Tu Juraj Gottweis Rory Sayres Ellery Wulczyn Le Hou Kevin Clark Stephen Pfohl Heather Cole-Lewis Darlene Neal Mike Schaekermann Amy Wang Mohamed Amin Sami Lachgar Philip Mansfield Sushant Prakash Bradley Green Ewa Dominowska Blaise\u00a0Aguera y Arcas Nenad Tomasev Yun Liu Renee Wong Christopher Semturs S.\u00a0Sara Mahdavi Joelle Barral Dale Webster Greg\u00a0S. Corrado Yossi Matias Shekoofeh Azizi Alan Karthikesalingam and Vivek Natarajan. 2023. Towards Expert-Level Medical Question Answering with Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2305.09617\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2305.09617"},{"key":"e_1_3_3_2_87_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581075"},{"key":"e_1_3_3_2_88_2","doi-asserted-by":"crossref","unstructured":"David\u00a0F Steiner Robert MacDonald Yun Liu Peter Truszkowski Jason\u00a0D Hipp Christopher Gammage Florence Thng Lily Peng and Martin\u00a0C Stumpe. 2018. Impact of deep learning assistance on the histopathologic review of lymph nodes for metastatic breast cancer. The American journal of surgical pathology 42 12 (2018) 1636\u20131646.","DOI":"10.1097\/PAS.0000000000001151"},{"key":"e_1_3_3_2_89_2","doi-asserted-by":"crossref","unstructured":"Mark Steyvers and Aakriti Kumar. 2024. Three challenges for AI-assisted decision-making. Perspectives on Psychological Science 19 5 (2024) 722\u2013734.","DOI":"10.1177\/17456916231181102"},{"key":"e_1_3_3_2_90_2","unstructured":"Lichao Sun Yue Huang Haoran Wang Siyuan Wu Qihui Zhang Chujie Gao Yixin Huang Wenhan Lyu Yixuan Zhang Xiner Li et\u00a0al. 2024. Trustllm: Trustworthiness in large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.05561 3 (2024)."},{"key":"e_1_3_3_2_91_2","doi-asserted-by":"publisher","DOI":"10.1145\/3708557.3716146"},{"key":"e_1_3_3_2_92_2","unstructured":"Annalisa Szymanski Simret\u00a0Araya Gebreegziabher Oghenemaro Anuyah Ronald\u00a0A Metoyer and Toby Jia-Jun Li. 2024. Comparing criteria development across domain experts lay users and models in large language model evaluation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.02054 (2024)."},{"key":"e_1_3_3_2_93_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3641924"},{"key":"e_1_3_3_2_94_2","doi-asserted-by":"crossref","unstructured":"Thomas Yu\u00a0Chow Tam Sonish Sivarajkumar Sumit Kapoor Alisa\u00a0V Stolyar Katelyn Polanska Karleigh\u00a0R McCarthy Hunter Osterhoudt Xizhi Wu Shyam Visweswaran Sunyang Fu et\u00a0al. 2024. A framework for human evaluation of large language models in healthcare derived from literature review. NPJ digital medicine 7 1 (2024) 258.","DOI":"10.1038\/s41746-024-01258-7"},{"key":"e_1_3_3_2_95_2","doi-asserted-by":"publisher","unstructured":"J.\u00a0E. Taylor G.\u00a0A. Rousselet C. Scheepers et\u00a0al. 2023. Rating norms should be calculated from cumulative link mixed effects models. Behavior Research Methods 55 5 (2023) 2175\u20132196. 10.3758\/s13428-022-01814-7","DOI":"10.3758\/s13428-022-01814-7"},{"key":"e_1_3_3_2_96_2","unstructured":"Amy Turner Meena Kaushik Mu-Ti Huang and Srikar Varanasi. 2022. Calibrating trust in AI-assisted decision making. Google Scholar Google Scholar Navigate to (2022)."},{"key":"e_1_3_3_2_97_2","doi-asserted-by":"crossref","unstructured":"Baptiste Vasey Myura Nagendran Bruce Campbell David\u00a0A Clifton Gary\u00a0S Collins Spiros Denaxas Alastair\u00a0K Denniston Livia Faes Bart Geerts Mudathir Ibrahim et\u00a0al. 2022. Reporting guideline for the early stage clinical evaluation of decision support systems driven by artificial intelligence: DECIDE-AI. bmj 377 (2022).","DOI":"10.1136\/bmj-2022-070904"},{"key":"e_1_3_3_2_98_2","doi-asserted-by":"publisher","unstructured":"Ethan Waisberg Joshua Ong Mouayad Masalkhi Sharif\u00a0Amit Kamran Nasif Zaman Prithul Sarker Andrew\u00a0G. Lee and Alireza Tavakkoli. 2023. GPT-4: a new era of artificial intelligence in medicine. Irish Journal of Medical Science (1971 -) 192 6 (Dec. 2023) 3197\u20133200. 10.1007\/s11845-023-03377-8","DOI":"10.1007\/s11845-023-03377-8"},{"key":"e_1_3_3_2_99_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445432"},{"key":"e_1_3_3_2_100_2","doi-asserted-by":"crossref","unstructured":"Dandan Wang and Shiqing Zhang. 2024. Large language models in medical and healthcare fields: applications advances and challenges. Artificial Intelligence Review 57 11 (2024) 299.","DOI":"10.1007\/s10462-024-10921-0"},{"key":"e_1_3_3_2_101_2","doi-asserted-by":"crossref","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Fei Xia Ed Chi Quoc\u00a0V Le Denny Zhou et\u00a0al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022) 24824\u201324837.","DOI":"10.52202\/068431-1800"},{"key":"e_1_3_3_2_102_2","doi-asserted-by":"publisher","DOI":"10.1145\/1518701.1518871"},{"key":"e_1_3_3_2_103_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581197"},{"key":"e_1_3_3_2_104_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713372"},{"key":"e_1_3_3_2_105_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3714272"},{"key":"e_1_3_3_2_106_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376807"},{"key":"e_1_3_3_2_107_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713772"},{"key":"e_1_3_3_2_108_2","unstructured":"Lawrence K.\u00a0Q. Yan Qian Niu Ming Li Yichao Zhang Caitlyn\u00a0Heqi Yin Cheng Fei Benji Peng Ziqian Bi Pohsun Feng Keyu Chen Tianyang Wang Yunze Wang Silin Chen Ming Liu and Junyu Liu. 2024. Large Language Model Benchmarks in Medical Tasks. arxiv:https:\/\/arXiv.org\/abs\/2410.21348\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2410.21348"},{"key":"e_1_3_3_2_109_2","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et\u00a0al. 2025. Qwen3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.09388 (2025)."},{"key":"e_1_3_3_2_110_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581393"},{"key":"e_1_3_3_2_111_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642013"},{"key":"e_1_3_3_2_112_2","doi-asserted-by":"crossref","unstructured":"Hongbo Zhang Junying Chen Feng Jiang Fei Yu Zhihong Chen Jianquan Li Guiming Chen Xiangbo Wu Zhiyi Zhang Qingying Xiao Xiang Wan Benyou Wang and Haizhou Li. 2023. HuatuoGPT Towards Taming Language Models To Be a Doctor. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.15075 (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.725"},{"key":"e_1_3_3_2_113_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642343"},{"key":"e_1_3_3_2_114_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747668"},{"key":"e_1_3_3_2_115_2","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372852"},{"key":"e_1_3_3_2_116_2","doi-asserted-by":"publisher","unstructured":"Nan Zhou. 2024. Evaluating Human and Machine Assessment: Introducing a Hybrid Approach for Enhanced Educational Evaluation. Lecture Notes in Education Psychology and Public Media 58 (07 2024) 118\u2013124. 10.54254\/2753-7048\/58\/20241716","DOI":"10.54254\/2753-7048\/58\/20241716"},{"key":"e_1_3_3_2_117_2","unstructured":"Yakun Zhu Zhongzhen Huang Linjie Mu Yutong Huang Wei Nie Jiaji Liu Shaoting Zhang Pengfei Liu and Xiaofan Zhang. 2025. DiagnosisArena: Benchmarking Diagnostic Reasoning for Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.14107 (2025)."},{"key":"e_1_3_3_2_118_2","doi-asserted-by":"crossref","unstructured":"Xuan Zou Weijie He Yu Huang Yi Ouyang Zhen Zhang Yu Wu Yongsheng Wu Lili Feng Sheng Wu Mengqi Yang et\u00a0al. 2024. AI-driven diagnostic assistance in medical inquiry: reinforcement learning algorithm development and validation. Journal of Medical Internet Research 26 (2024) e54616.","DOI":"10.2196\/54616"},{"key":"e_1_3_3_2_119_2","first-page":"195","volume-title":"AAAI Bridge Program on AI for Medicine and Healthcare","author":"Zuo Kaiwen","year":"2025","unstructured":"Kaiwen Zuo, Yirui Jiang, Fan Mo, and Pietro Lio. 2025. Kg4diagnosis: A hierarchical multi-agent llm framework with knowledge graph enhancement for medical diagnosis. In AAAI Bridge Program on AI for Medicine and Healthcare. PMLR, 195\u2013204."}],"event":{"name":"CHI 2026: CHI Conference on Human Factors in Computing Systems","location":"Barcelona Spain","acronym":"CHI '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2026 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772318.3790835","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T10:17:42Z","timestamp":1776421062000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772318.3790835"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,13]]},"references-count":118,"alternative-id":["10.1145\/3772318.3790835","10.1145\/3772318"],"URL":"https:\/\/doi.org\/10.1145\/3772318.3790835","relation":{},"subject":[],"published":{"date-parts":[[2026,4,13]]},"assertion":[{"value":"2026-04-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}