{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T21:23:42Z","timestamp":1780694622479,"version":"3.54.1"},"reference-count":20,"publisher":"Oxford University Press (OUP)","issue":"2","license":[{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"VA Advanced Fellowship in Medical Informatics"},{"DOI":"10.13039\/100000060","name":"National Institute of Allergy and Infectious Diseases","doi-asserted-by":"publisher","award":["1R01AI17812101"],"award-info":[{"award-number":["1R01AI17812101"]}],"id":[{"id":"10.13039\/100000060","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Institute on Drug Abuse Clinical Trials Network","award":["UG1DA015815\u2014CTN-0136"],"award-info":[{"award-number":["UG1DA015815\u2014CTN-0136"]}]},{"name":"National Center for Advancing Translational Sciences\u2019s Clinical and Translational Science","award":["UL1TR003142"],"award-info":[{"award-number":["UL1TR003142"]}]},{"DOI":"10.13039\/100000936","name":"Gordon and Betty Moore Foundation","doi-asserted-by":"publisher","award":["#12409"],"award-info":[{"award-number":["#12409"]}],"id":[{"id":"10.13039\/100000936","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Stanford Artificial Intelligence in Medicine and Imaging\u2014Human-Centered Artificial Intelligence"},{"DOI":"10.13039\/100020670","name":"Stanford Institute for Human-Centered Artificial Intelligence","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100020670","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Stanford Bio-X Interdisciplinary Seed"},{"name":"American Heart Association\u2014Strategically Focused Research Network\u2014Diversity in Clinical Trials"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,2,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Objectives<\/jats:title>\n                  <jats:p>We aimed to demonstrate the importance of establishing best practices in large language model research, using repeat prompting as an illustrative example.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Materials and Methods<\/jats:title>\n                  <jats:p>Using data from a prior study investigating potential model bias in peer review of medical abstracts, we compared methods that ignore correlation in model outputs from repeated prompting with a random effects method that accounts for this correlation.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>High correlation within groups was found when repeatedly prompting the model, with intraclass correlation coefficient of 0.69. Ignoring the inherent correlation in the data led to over 100-fold inflation of effective sample size. After appropriately accounting for this issue, the authors\u2019 results reverse from a small but highly significant finding to no evidence of model bias.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Discussion<\/jats:title>\n                  <jats:p>The establishment of best practices for LLM research is urgently needed, as demonstrated in this case where accounting for repeat prompting in analyses was critical for accurate study conclusions.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/jamia\/ocae294","type":"journal-article","created":{"date-parts":[[2024,12,10]],"date-time":"2024-12-10T19:25:38Z","timestamp":1733858738000},"page":"386-390","source":"Crossref","is-referenced-by-count":8,"title":["Establishing best practices in large language model research: an application to repeat prompting"],"prefix":"10.1093","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2601-0173","authenticated-orcid":false,"given":"Robert J","family":"Gallo","sequence":"first","affiliation":[{"name":"Center for Innovation to Implementation, VA Palo Alto Health Care System , Menlo Park, CA 94025,","place":["United States"]},{"name":"Department of Health Policy, Stanford University , Stanford, CA 94305,","place":["United States"]}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7571-5268","authenticated-orcid":false,"given":"Michael","family":"Baiocchi","sequence":"additional","affiliation":[{"name":"Department of Epidemiology and Population Health, Stanford University , Stanford, CA 94305,","place":["United States"]}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Thomas R","family":"Savage","sequence":"additional","affiliation":[{"name":"Division of Hospital Medicine, Stanford University , Stanford, CA 94305,","place":["United States"]}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jonathan H","family":"Chen","sequence":"additional","affiliation":[{"name":"Division of Hospital Medicine, Stanford University , Stanford, CA 94305,","place":["United States"]},{"name":"Stanford Center for Biomedical Informatics Research, Stanford University , Stanford, CA 94304,","place":["United States"]},{"name":"Clinical Excellence Research Center, Stanford University , Stanford, CA 94305,","place":["United States"]}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"286","published-online":{"date-parts":[[2024,12,4]]},"reference":[{"key":"2025012119040225600_ocae294-B1","doi-asserted-by":"publisher","first-page":"e2335924","DOI":"10.1001\/jamanetworkopen.2023.35924","article-title":"Evaluating the application of large language models in clinical research contexts","volume":"6","author":"Perlis","year":"2023","journal-title":"JAMA Netw Open"},{"key":"2025012119040225600_ocae294-B2","doi-asserted-by":"publisher","first-page":"ocae254","DOI":"10.1093\/jamia\/ocae254","article-title":"Large language model uncertainty proxies: discrimination and calibration for medical diagnosis and treatment","author":"Savage","year":"2024","journal-title":"J Am Med Inform Assoc"},{"key":"2025012119040225600_ocae294-B3","doi-asserted-by":"publisher","first-page":"252","DOI":"10.1001\/jama.2023.24641","article-title":"Affiliation bias in peer review of abstracts by a large language model","volume":"331","author":"von Wedel","year":"2024","journal-title":"JAMA"},{"key":"2025012119040225600_ocae294-B4","doi-asserted-by":"publisher","first-page":"2613","DOI":"10.1038\/s41591-024-03097-1","article-title":"Evaluation and mitigation of the limitations of large language models in clinical decision-making","volume":"30","author":"Hager","year":"2024","journal-title":"Nat Med"},{"key":"2025012119040225600_ocae294-B5","doi-asserted-by":"crossref","first-page":"e12","DOI":"10.1016\/S2589-7500(23)00225-X","article-title":"Assessing the potential of GPT-4 to perpetuate racial and gender biases in health care: a model evaluation study","volume":"6","author":"Zack","year":"2024","journal-title":"Lancet Digit Health"},{"key":"2025012119040225600_ocae294-B6","doi-asserted-by":"publisher","DOI":"10.1056\/AIdbp2300192","article-title":"GPT versus resident physicians\u2014a benchmark based on official board scores","volume":"1","author":"Katz","year":"2024","journal-title":"NEJM AI"},{"key":"2025012119040225600_ocae294-B7","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1038\/s41746-024-01029-4","article-title":"Prompt engineering in consistency and reliability with the evidence-based guideline for LLMs","volume":"7","author":"Wang","year":"2024","journal-title":"NPJ Digit Med"},{"key":"2025012119040225600_ocae294-B8","doi-asserted-by":"publisher","first-page":"896","DOI":"10.1136\/bmj.308.6933.896","article-title":"Correlation, regression, and repeated data","volume":"308","author":"Bland","year":"1994","journal-title":"BMJ"},{"key":"2025012119040225600_ocae294-B9","doi-asserted-by":"publisher","first-page":"e072883","DOI":"10.1136\/bmj-2022-072883","article-title":"On the 12th day of Christmas, a statistician sent to me","volume":"379","author":"Riley","year":"2022","journal-title":"BMJ"},{"key":"2025012119040225600_ocae294-B10","doi-asserted-by":"publisher","first-page":"j3064","DOI":"10.1136\/bmj.j3064","article-title":"How to design efficient cluster randomised trials","volume":"358","author":"Hemming","year":"2017","journal-title":"BMJ"},{"key":"2025012119040225600_ocae294-B11","doi-asserted-by":"publisher","first-page":"1234","DOI":"10.1001\/jama.2024.3520","article-title":"Affiliation bias in peer review of abstracts","volume":"331","author":"Gallo","year":"2024","journal-title":"JAMA"},{"key":"2025012119040225600_ocae294-B12","doi-asserted-by":"publisher","first-page":"1235","DOI":"10.1001\/jama.2024.3523","article-title":"Affiliation bias in peer review of abstracts\u2014reply","volume":"331","author":"von Wedel","year":"2024","journal-title":"JAMA"},{"key":"2025012119040225600_ocae294-B13","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1038\/s41746-024-01010-1","article-title":"Diagnostic reasoning prompts reveal the potential for large language model interpretability in medicine","volume":"7","author":"Savage","year":"2024","journal-title":"NPJ Digit Med"},{"key":"2025012119040225600_ocae294-B14","doi-asserted-by":"publisher","first-page":"1051","DOI":"10.1093\/ije\/dyv113","article-title":"Methods for sample size determination in cluster randomized trials","volume":"44","author":"Rutterford","year":"2015","journal-title":"Int J Epidemiol"},{"key":"2025012119040225600_ocae294-B15","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511790942","volume-title":"Data Analysis Using Regression and Multilevel\/Hierarchical Models","author":"Gelman","year":"2006"},{"key":"2025012119040225600_ocae294-B16","volume-title":"Content Analysis: An Introduction to its Methodology","author":"Krippendorff","year":"2018"},{"key":"2025012119040225600_ocae294-B17","doi-asserted-by":"publisher","first-page":"385","DOI":"10.1016\/0304-4076(86)90021-7","article-title":"Random group effects and the precision of regression estimates","volume":"32","author":"Moulton","year":"1986","journal-title":"J Econom"},{"key":"2025012119040225600_ocae294-B18","doi-asserted-by":"crossref","first-page":"e2440969","DOI":"10.1001\/jamanetworkopen.2024.40969","article-title":"Large language model influence on diagnostic reasoning: a randomized clinical trial","volume":"7","author":"Goh","year":"2024","journal-title":"JAMA Netw Open"},{"key":"2025012119040225600_ocae294-B19","author":"UK AI Safety Institute","year":"2024"},{"key":"2025012119040225600_ocae294-B20","author":"Gallifant"}],"container-title":["Journal of the American Medical Informatics Association"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/jamia\/article-pdf\/32\/2\/386\/60951519\/ocae294.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/jamia\/article-pdf\/32\/2\/386\/60951519\/ocae294.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,21]],"date-time":"2025-01-21T19:04:19Z","timestamp":1737486259000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/jamia\/article\/32\/2\/386\/7916529"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,4]]},"references-count":20,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2024,12,4]]},"published-print":{"date-parts":[[2025,2,1]]}},"URL":"https:\/\/doi.org\/10.1093\/jamia\/ocae294","relation":{},"ISSN":["1067-5027","1527-974X"],"issn-type":[{"value":"1067-5027","type":"print"},{"value":"1527-974X","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2025,2]]},"published":{"date-parts":[[2024,12,4]]}}}