{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T17:53:04Z","timestamp":1765389184448,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1145\/3765612.3767224","type":"proceedings-article","created":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T17:45:59Z","timestamp":1765388759000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["GamELY: Human-in-the loop Framework for Scaling Human Evaluation of LLMs in Healthcare"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6643-4333","authenticated-orcid":false,"given":"Raghav","family":"Awasthi","sequence":"first","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3692-7254","authenticated-orcid":false,"given":"Nishant","family":"Singh","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Boston, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1052-5546","authenticated-orcid":false,"given":"Shreya","family":"Mishra","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8041-9105","authenticated-orcid":false,"given":"Atharva","family":"Bhattad","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, Ohio, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6398-6047","authenticated-orcid":false,"given":"Moises","family":"Auron","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1283-0514","authenticated-orcid":false,"given":"Charumathi Raghu","family":"Subramanian","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, Ohio, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8170-2861","authenticated-orcid":false,"given":"Ashish","family":"Atreja","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1961-5577","authenticated-orcid":false,"given":"Kamal","family":"Maheshwari","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9749-7858","authenticated-orcid":false,"given":"Dwarikanath","family":"Mahapatra","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Abu Dhabi, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6283-3049","authenticated-orcid":false,"given":"Jacek B.","family":"Cywinski","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9083-891X","authenticated-orcid":false,"given":"Ashish","family":"Khanna","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0455-2892","authenticated-orcid":false,"given":"Francis","family":"Papay","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3777-8767","authenticated-orcid":false,"given":"Piyush","family":"Mathur","sequence":"additional","affiliation":[{"name":"BrainXAI ReSearch, BrainX,LLC., Cleveland, OH, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,12,10]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jacc.2019.03.010"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1101\/2023.12.22.23300458"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Raghav Awasthi Shreya Mishra Charumathi Raghu Auron Moises Ashish Atreja Dwarikanath Mahapatra Nishant Singh Ashish K Khanna Jacek B Cywinski Kamal Maheshwari et al. 2025. Theory of Mind Imitation by LLMs for Physician-Like Human Evaluation. medRxiv (2025) 2025\u201303.","DOI":"10.1101\/2025.03.01.25323142"},{"key":"e_1_3_2_2_4_1","volume-title":"Reference-guided verdict: Llms-as-judges in automatic evaluation of free-form text","author":"Badshah Sher","year":"2024","unstructured":"Sher Badshah and Hassan Sajjad. 2024. Reference-guided verdict: Llms-as-judges in automatic evaluation of free-form text, 2024. URL https:\/\/arxiv.org\/abs\/2408.09235 (2024)."},{"key":"e_1_3_2_2_5_1","volume-title":"Lisa Soleymani Lehmann, et al","author":"Bedi Suhana","year":"2024","unstructured":"Suhana Bedi, Yutong Liu, Lucy Orr-Ewing, Dev Dash, Sanmi Koyejo, Alison Callahan, Jason A Fries, Michael Wornow, Akshay Swaminathan, Lisa Soleymani Lehmann, et al. 2024. A systematic review of testing and evaluation of healthcare applications of large language models (LLMs). medRxiv (2024), 2024\u201304."},{"key":"e_1_3_2_2_6_1","volume-title":"Comparing Two Model Designs for Clinical Note Generation","author":"Brake Nathan","year":"2024","unstructured":"Nathan Brake and Thomas Schaaf. 2024. Comparing Two Model Designs for Clinical Note Generation; Is an LLM a Useful Evaluator of Consistency? arXiv preprint arXiv:2404.06503 (2024)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"crossref","unstructured":"Clare Bycroft Colin Freeman Desislava Petkova Gavin Band Lloyd T Elliott Kevin Sharp Allan Motyer Damjan Vukcevic Olivier Delaneau Jared O'Connell et al. 2018. The UK Biobank resource with deep phenotyping and genomic data. Nature 562 7726 (2018) 203\u2013209.","DOI":"10.1038\/s41586-018-0579-z"},{"key":"e_1_3_2_2_8_1","volume-title":"Humanin-the-Loop through Chain-of-Thought. arXiv preprint arXiv:2306.07932","author":"Cai Zefan","year":"2023","unstructured":"Zefan Cai, Baobao Chang, and Wenjuan Han. 2023. Humanin-the-Loop through Chain-of-Thought. arXiv preprint arXiv:2306.07932 (2023)."},{"key":"e_1_3_2_2_9_1","volume-title":"Humans or llms as the judge? a study on judgement biases","author":"Chen Guiming Hardy","year":"2024","unstructured":"Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. 2024. Humans or llms as the judge? a study on judgement biases, 2024. URL https:\/\/arxiv.org\/abs\/2402 10669 (2024)."},{"key":"e_1_3_2_2_10_1","volume-title":"Deep reinforcement learning from human preferences. Advances in neural information processing systems 30","author":"Christiano Paul F","year":"2017","unstructured":"Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei. 2017. Deep reinforcement learning from human preferences. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1001\/jama.2021.6238"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1001\/jama.2021.12531"},{"key":"e_1_3_2_2_13_1","volume-title":"What Linguistic Features and Languages are Important in LLM Translation? arXiv e-prints","author":"Diandaru Ryandito","year":"2024","unstructured":"Ryandito Diandaru, Lucky Susanto, Zilu Tang, Ayu Purwarianti, and Derry Wijaya. 2024. What Linguistic Features and Languages are Important in LLM Translation? arXiv e-prints (2024), arXiv-2402."},{"key":"e_1_3_2_2_14_1","volume-title":"Can LLM be a Personalized Judge? arXiv preprint arXiv:2406.11657","author":"Dong Yijiang River","year":"2024","unstructured":"Yijiang River Dong, Tiancheng Hu, and Nigel Collier. 2024. Can LLM be a Personalized Judge? arXiv preprint arXiv:2406.11657 (2024)."},{"key":"e_1_3_2_2_15_1","volume-title":"Distances with mixed type variables some modified Gower's coefficients. arXiv preprint arXiv:2101.02481","author":"D'Orazio Marcello","year":"2021","unstructured":"Marcello D'Orazio. 2021. Distances with mixed type variables some modified Gower's coefficients. arXiv preprint arXiv:2101.02481 (2021)."},{"key":"e_1_3_2_2_16_1","unstructured":"Jiawei Gu Xuhui Jiang Zhichao Shi Hexiang Tan Xuehao Zhai Chengjin Xu Wei Li Yinghan Shen Shengjie Ma Honghao Liu et al. 2024. A survey on llm-as-a-judge. arXiv preprint arXiv:2411.15594 (2024)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1177\/0013164415596420"},{"key":"e_1_3_2_2_18_1","volume-title":"Leo Anthony Celi, and Roger G Mark","author":"Johnson Alistair EW","year":"2016","unstructured":"Alistair EW Johnson, Tom J Pollard, Lu Shen, Li-wei H Lehman, Mengling Feng, Mohammad Ghassemi, Benjamin Moody, Peter Szolovits, Leo Anthony Celi, and Roger G Mark. 2016. MIMIC-III, a freely accessible critical care database. Scientific data 3, 1 (2016), 1\u20139."},{"key":"e_1_3_2_2_19_1","volume-title":"Towards leveraging large language models for automated medical q&a evaluation. arXiv preprint arXiv:2409.01941","author":"Krolik Jack","year":"2024","unstructured":"Jack Krolik, Herprit Mahal, Feroz Ahmad, Gaurav Trivedi, and Bahador Saket. 2024. Towards leveraging large language models for automated medical q&a evaluation. arXiv preprint arXiv:2409.01941 (2024)."},{"key":"e_1_3_2_2_20_1","volume-title":"LLM-based Automated Grading with Human-in-the-Loop. arXiv preprint arXiv:2504.05239","author":"Li Hang","year":"2025","unstructured":"Hang Li, Yucheng Chu, Kaiqi Yang, Yasemin Copur-Gencturk, and Jiliang Tang. 2025. LLM-based Automated Grading with Human-in-the-Loop. arXiv preprint arXiv:2504.05239 (2025)."},{"key":"e_1_3_2_2_21_1","volume-title":"G-eval: NLG evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634","author":"Liu Yang","year":"2023","unstructured":"Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu, and Chenguang Zhu. 2023. G-eval: NLG evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634 (2023)."},{"key":"e_1_3_2_2_22_1","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov Rafael","year":"2023","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. 2023. Direct preference optimization: Your language model is secretly a reward model. Advances in Neural Information Processing Systems 36 (2023), 53728\u201353741.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_23_1","unstructured":"Andrea H Ramirez Lina Sulieman David J Schlueter Alese Halvorson Jun Qian Francis Ratsimbazafy Roxana Loperena Kelsey Mayo Melissa Basford Nicole Deflaux et al. 2022. The All of Us Research Program: data quality utility and diversity. Patterns 3 8 (2022)."},{"key":"e_1_3_2_2_24_1","volume-title":"Minimum sample size calculations for external validation of a clinical prediction model with a time-to-event outcome. Statistics in medicine 41, 7","author":"Riley Richard D","year":"2022","unstructured":"Richard D Riley, Gary S Collins, Joie Ensor, Lucinda Archer, Sarah Booth, Sarwar I Mozumder, Mark J Rutherford, Maarten van Smeden, Paul C Lambert, and Kym IE Snell. 2022. Minimum sample size calculations for external validation of a clinical prediction model with a time-to-event outcome. Statistics in medicine 41, 7 (2022), 1280\u20131295."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3068335"},{"key":"e_1_3_2_2_26_1","unstructured":"Burr Settles. 2009. Active learning literature survey. (2009)."},{"key":"e_1_3_2_2_27_1","volume-title":"Monte Carlo sampling methods. Handbooks in operations research and management science 10","author":"Shapiro Alexander","year":"2003","unstructured":"Alexander Shapiro. 2003. Monte Carlo sampling methods. Handbooks in operations research and management science 10 (2003), 353\u2013425."},{"key":"e_1_3_2_2_28_1","volume-title":"Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, et al.","author":"Singhal Karan","year":"2023","unstructured":"Karan Singhal, Shekoofeh Azizi, Tao Tu, S Sara Mahdavi, Jason Wei, Hyung Won Chung, Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, et al. 2023. Large language models encode clinical knowledge. Nature 620, 7972 (2023), 172\u2013180."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Thomas Yu Chow Tam Sonish Sivarajkumar Sumit Kapoor Alisa V Stolyar Katelyn Polanska Karleigh R McCarthy Hunter Osterhoudt Xizhi Wu Shyam Visweswaran Sunyang Fu et al. 2024. A framework for human evaluation of large language models in healthcare derived from literature review. NPJ digital medicine 7 1 (2024) 258.","DOI":"10.1038\/s41746-024-01258-7"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICISA.2011.5772437"},{"key":"e_1_3_2_2_31_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_2_32_1","volume-title":"International conference on machine learning. PMLR","author":"Wei Kai","year":"2015","unstructured":"Kai Wei, Rishabh Iyer, and Jeff Bilmes. 2015. Submodularity in data subset selection and active learning. In International conference on machine learning. PMLR, 1954\u20131963."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2022.05.014"},{"key":"e_1_3_2_2_34_1","volume-title":"Human-in-the-loop machine translation with large language model. arXiv preprint arXiv:2310.08908","author":"Yang Xinyi","year":"2023","unstructured":"Xinyi Yang, Runzhe Zhan, Derek F Wong, Junchao Wu, and Lidia S Chao. 2023. Human-in-the-loop machine translation with large language model. arXiv preprint arXiv:2310.08908 (2023)."},{"key":"e_1_3_2_2_35_1","volume-title":"The Impact of Example Selection in Few-Shot Prompting on Automated Essay Scoring Using GPT Models. In International Conference on Artificial Intelligence in Education. Springer, 61\u201373","author":"Yoshida Lui","year":"2024","unstructured":"Lui Yoshida. 2024. The Impact of Example Selection in Few-Shot Prompting on Automated Essay Scoring Using GPT Models. In International Conference on Artificial Intelligence in Education. Springer, 61\u201373."},{"key":"e_1_3_2_2_36_1","volume-title":"Song Harris Ao, and Piper Liping Liu.","author":"Zhao Xinshu","year":"2022","unstructured":"Xinshu Zhao, Guangchao Charles Feng, Song Harris Ao, and Piper Liping Liu. 2022. Interrater reliability estimators tested against true interrater reliabilities. BMC medical research methodology 22, 1 (2022), 232."}],"event":{"name":"BCB '25: 16th ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics","location":"Element Philadelphia Downtown Philadelphia PA USA","acronym":"BCB '25","sponsor":["SIGBio ACM Special Interest Group on Bioinformatics"]},"container-title":["Proceedings of the 16th ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3765612.3767224","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T17:48:32Z","timestamp":1765388912000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3765612.3767224"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":36,"alternative-id":["10.1145\/3765612.3767224","10.1145\/3765612"],"URL":"https:\/\/doi.org\/10.1145\/3765612.3767224","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-12-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}