{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T14:46:54Z","timestamp":1776091614324,"version":"3.50.1"},"reference-count":29,"publisher":"Oxford University Press (OUP)","issue":"5","license":[{"start":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T00:00:00Z","timestamp":1758585600000},"content-version":"vor","delay-in-days":23,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["R35GM154865"],"award-info":[{"award-number":["R35GM154865"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000057","name":"National Institute of General Medical Sciences","doi-asserted-by":"publisher","award":["R35GM150887"],"award-info":[{"award-number":["R35GM150887"]}],"id":[{"id":"10.13039\/100000057","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,8,31]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:p>Large language models (LLMs) show promise in biomedical research, but their effectiveness for genomic inquiry remains unclear. We developed GeneTuring, a benchmark consisting of 16 genomics tasks with 1600 curated questions, and manually evaluated 48 000 answers from 10 LLM configurations, including GPT-4o (via API, ChatGPT with web access, and a custom Generative Pretrained Transformer (GPT) setup), GPT-3.5, Claude 3.5, Gemini Advanced, GeneGPT (both slim and full), BioGPT, and BioMedLM. A custom GPT-4o configuration integrated with National Center for Biotechnology Information (NCBI) Application Programming Interfaces (APIs), developed in this study as SeqSnap, achieved the best overall performance. GPT-4o with web access and GeneGPT demonstrated complementary strengths. Our findings highlight both the promise and current limitations of LLMs in genomics, and emphasize the value of combining LLMs with domain-specific tools for robust genomic intelligence. GeneTuring offers a key resource for benchmarking and improving LLMs in biomedical research.<\/jats:p>","DOI":"10.1093\/bib\/bbaf492","type":"journal-article","created":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T05:23:32Z","timestamp":1758605012000},"source":"Crossref","is-referenced-by-count":5,"title":["Benchmarking large language models for genomic knowledge with GeneTuring"],"prefix":"10.1093","volume":"26","author":[{"given":"Xinyi","family":"Shang","sequence":"first","affiliation":[{"name":"Department of Biostatistics , Mailman School of Public Health, Columbia University, 722 West 168th Street, New York, NY 10032,","place":["United States"]}]},{"given":"Xu","family":"Liao","sequence":"additional","affiliation":[{"name":"Department of Biostatistics , Mailman School of Public Health, Columbia University, 722 West 168th Street, New York, NY 10032,","place":["United States"]}]},{"given":"Zhicheng","family":"Ji","sequence":"additional","affiliation":[{"name":"Department of Biostatistics and Bioinformatics , Duke University School of Medicine, 2424 Erwin Road, Durham, NC 27705,","place":["United States"]}]},{"given":"Wenpin","family":"Hou","sequence":"additional","affiliation":[{"name":"Department of Biostatistics , Mailman School of Public Health, Columbia University, 722 West 168th Street, New York, NY 10032,","place":["United States"]},{"name":"Data Science Institute , Columbia University, 550 West 120th Street, New York, NY 10027,","place":["United States"]}]}],"member":"286","published-online":{"date-parts":[[2025,9,23]]},"reference":[{"key":"2025092301232607200_ref1","article-title":"New embedding models and API updates","author":"OpenAI","year":"2024"},{"key":"2025092301232607200_ref2","article-title":"GPT-4 system card","author":"OpenAI"},{"key":"2025092301232607200_ref3","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023"},{"key":"2025092301232607200_ref4","article-title":"Model card and evaluations for Claude models","author":"Anthropic"},{"key":"2025092301232607200_ref5","doi-asserted-by":"publisher","article-title":"Comparing large language models and human programmers for generating programming code","author":"Hou","DOI":"10.1002\/advs.202412279"},{"key":"2025092301232607200_ref6","doi-asserted-by":"publisher","article-title":"Assessing large multimodal models for one-shot learning and interpretability in biomedical image classification","author":"Hou","DOI":"10.1101\/2023.12.31.573796"},{"key":"2025092301232607200_ref7","doi-asserted-by":"publisher","first-page":"1462","DOI":"10.1038\/s41592-024-02235-4","article-title":"Assessing GPT-4 for cell type annotation in single-cell RNA-seq analysis","volume":"21","author":"Hou","year":"2024","journal-title":"Nat Methods"},{"key":"2025092301232607200_ref8","doi-asserted-by":"publisher","first-page":"483","DOI":"10.1038\/s41551-024-01284-6","article-title":"Simple and effective embedding model for single-cell biology built from ChatGPT","volume":"9","author":"Chen","year":"2024","journal-title":"Nat Biomed Eng"},{"key":"2025092301232607200_ref9","article-title":"Measuring massive multitask language understanding","author":"Hendrycks","year":"2020"},{"key":"2025092301232607200_ref10","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"2025092301232607200_ref11","doi-asserted-by":"publisher","first-page":"2190","DOI":"10.1016\/j.ajhg.2024.08.010","article-title":"Assessing the utility of large language models for phenotype-driven gene prioritization in the diagnosis of rare genetic disease","volume":"111","author":"Kim","year":"2024","journal-title":"Amer J Hum Genet"},{"key":"2025092301232607200_ref12","doi-asserted-by":"publisher","first-page":"i266","DOI":"10.1093\/bioinformatics\/btae230","article-title":"BioCoder: a benchmark for bioinformatics code generation with large language models","volume":"40","author":"Tang","year":"2024","journal-title":"Bioinformatics"},{"key":"2025092301232607200_ref13","doi-asserted-by":"publisher","first-page":"3498","DOI":"10.1016\/j.csbj.2024.09.031","article-title":"Large language models and their applications in bioinformatics","volume":"23","author":"Sarumi","year":"2024","journal-title":"Comput Struct Biotechnol J"},{"key":"2025092301232607200_ref14","doi-asserted-by":"publisher","DOI":"10.1093\/bib\/bbac409","article-title":"BioGPT: generative pre-trained transformer for biomedical text generation and mining","volume":"23","author":"Luo","year":"2022","journal-title":"Brief Bioinform"},{"key":"2025092301232607200_ref15","article-title":"BioMedLM: a domain-specific large language model for biomedical text","author":"Venigalla"},{"key":"2025092301232607200_ref16","article-title":"GPT-4o system card","author":"OpenAI"},{"key":"2025092301232607200_ref17","doi-asserted-by":"publisher","first-page":"btae075","DOI":"10.1093\/bioinformatics\/btae075","article-title":"GeneGPT: augmenting large language models with domain tools for improved access to biomedical information","volume":"40","author":"Jin","year":"2024","journal-title":"Bioinformatics"},{"key":"2025092301232607200_ref18","doi-asserted-by":"crossref","article-title":"Siren\u2019s song in the AI ocean: a survey on hallucination in large language models","author":"Zhang","DOI":"10.1162\/COLI.a.16"},{"key":"2025092301232607200_ref19","article-title":"A survey of hallucination in large foundation models","author":"Rawte","year":"2023"},{"key":"2025092301232607200_ref20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/gb-2008-9-s2-s2","article-title":"Overview of biocreative II gene mention recognition","volume":"9","author":"Smith","year":"2008","journal-title":"Genome Biol"},{"key":"2025092301232607200_ref21","doi-asserted-by":"publisher","first-page":"D5","DOI":"10.1093\/nar\/gkl1031","article-title":"Database resources of the National Center for Biotechnology Information","volume":"35","author":"Wheeler","year":"2007","journal-title":"Nucleic Acids Res"},{"key":"2025092301232607200_ref22","doi-asserted-by":"publisher","first-page":"D766","DOI":"10.1093\/nar\/gky955","article-title":"Gencode reference annotation for the human and mouse genomes","volume":"47","author":"Frankish","year":"2019","journal-title":"Nucleic Acids Res"},{"key":"2025092301232607200_ref23","doi-asserted-by":"publisher","first-page":"D1038","DOI":"10.1093\/nar\/gky1151","article-title":"OMIM.org: leveraging knowledge across phenotype\u2013gene relationships","volume":"47","author":"Amberger","year":"2019","journal-title":"Nucleic Acids Res"},{"key":"2025092301232607200_ref24","doi-asserted-by":"publisher","first-page":"417","DOI":"10.1016\/j.cels.2015.12.004","article-title":"The molecular signatures database hallmark gene set collection","volume":"1","author":"Liberzon","year":"2015","journal-title":"Cell systems"},{"key":"2025092301232607200_ref25","doi-asserted-by":"publisher","first-page":"D380","DOI":"10.1093\/nar\/gkx1013","article-title":"Trrust v2: an expanded reference database of human and mouse transcriptional regulatory interactions","volume":"46","author":"Han","year":"2018","journal-title":"Nucleic Acids Res"},{"key":"2025092301232607200_ref26","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1038\/nmeth.3252","article-title":"Orchestrating high-throughput genomic analysis with bioconductor","volume":"12","author":"Huber","year":"2015","journal-title":"Nat Methods"},{"key":"2025092301232607200_ref27","article-title":"A general introduction to the e-utilities","volume-title":"Entrez Programming Utilities Help [Internet]","author":"Sayers","year":"2010"},{"key":"2025092301232607200_ref28","doi-asserted-by":"publisher","first-page":"403","DOI":"10.1016\/S0022-2836(05)80360-2","article-title":"Basic local alignment search tool","volume":"215","author":"Altschul","year":"1990","journal-title":"J Mol Biol"},{"key":"2025092301232607200_ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6","article-title":"Transformers: state-of-the-art natural language processing","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: system Demonstrations","author":"Wolf"}],"container-title":["Briefings in Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bib\/article-pdf\/26\/5\/bbaf492\/64347546\/bbaf492.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bib\/article-pdf\/26\/5\/bbaf492\/64347546\/bbaf492.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T05:23:46Z","timestamp":1758605026000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bib\/article\/doi\/10.1093\/bib\/bbaf492\/8261762"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,31]]},"references-count":29,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,8,31]]}},"URL":"https:\/\/doi.org\/10.1093\/bib\/bbaf492","relation":{},"ISSN":["1467-5463","1477-4054"],"issn-type":[{"value":"1467-5463","type":"print"},{"value":"1477-4054","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2025,9]]},"published":{"date-parts":[[2025,8,31]]},"article-number":"bbaf492"}}