{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T10:30:32Z","timestamp":1780569032320,"version":"3.54.1"},"reference-count":42,"publisher":"Oxford University Press (OUP)","issue":"1","license":[{"start":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T00:00:00Z","timestamp":1740441600000},"content-version":"vor","delay-in-days":95,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"name":"Advanced Computing and Imaging in Biomedicine","award":["NTU-113\u00a0L900701"],"award-info":[{"award-number":["NTU-113\u00a0L900701"]}]},{"name":"Ministry of Education in Taiwan"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,22]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:p>With the exponential growth of biomedical literature, leveraging Large Language Models (LLMs) for automated medical knowledge understanding has become increasingly critical for advancing precision medicine. However, current approaches face significant challenges in reliability, verifiability, and scalability when extracting complex biological relationships from scientific literature using LLMs. To overcome the obstacles of LLM development in biomedical literature understating, we propose LORE, a novel unsupervised two-stage reading methodology with LLM that models literature as a knowledge graph of verifiable factual statements and, in turn, as semantic embeddings in Euclidean space. LORE captured essential gene pathogenicity information when applied to PubMed abstracts for large-scale understanding of disease\u2013gene relationships. We demonstrated that modeling a latent pathogenic flow in the semantic embedding with supervision from the ClinVar database led to a 90% mean average precision in identifying relevant genes across 2097 diseases. This work provides a scalable and reproducible approach for leveraging LLMs in biomedical literature analysis, offering new opportunities for researchers to identify therapeutic targets efficiently.<\/jats:p>","DOI":"10.1093\/bib\/bbaf070","type":"journal-article","created":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T14:02:06Z","timestamp":1740492126000},"source":"Crossref","is-referenced-by-count":10,"title":["A large language model framework for literature-based disease\u2013gene association prediction"],"prefix":"10.1093","volume":"26","author":[{"given":"Peng-Hsuan","family":"Li","sequence":"first","affiliation":[{"name":"Taiwan AI Labs , 6F., No. 70, Sec. 1, Chengde Road, Datong Dist., Taipei 10355 ,","place":["Taiwan"]}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yih-Yun","family":"Sun","sequence":"additional","affiliation":[{"name":"Taiwan AI Labs , 6F., No. 70, Sec. 1, Chengde Road, Datong Dist., Taipei 10355 ,","place":["Taiwan"]}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4876-3309","authenticated-orcid":false,"given":"Hsueh-Fen","family":"Juan","sequence":"additional","affiliation":[{"name":"Taiwan AI Labs , 6F., No. 70, Sec. 1, Chengde Road, Datong Dist., Taipei 10355 ,","place":["Taiwan"]},{"name":"Department of Life Science, National Taiwan University , No. 1, Sec. 4, Roosevelt Rd., Taipei 10617 ,","place":["Taiwan"]},{"name":"Center for Computational and Systems Biology, National Taiwan University , No. 1, Sec. 4, Roosevelt Road, Taipei 10617 ,","place":["Taiwan"]},{"name":"Center for Advanced Computing and Imaging in Biomedicine, National Taiwan University , No. 1, Sec. 4, Roosevelt Road, Taipei 10617 ,","place":["Taiwan"]}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6940-6389","authenticated-orcid":false,"given":"Chien-Yu","family":"Chen","sequence":"additional","affiliation":[{"name":"Taiwan AI Labs , 6F., No. 70, Sec. 1, Chengde Road, Datong Dist., Taipei 10355 ,","place":["Taiwan"]},{"name":"Center for Computational and Systems Biology, National Taiwan University , No. 1, Sec. 4, Roosevelt Road, Taipei 10617 ,","place":["Taiwan"]},{"name":"Center for Advanced Computing and Imaging in Biomedicine, National Taiwan University , No. 1, Sec. 4, Roosevelt Road, Taipei 10617 ,","place":["Taiwan"]},{"name":"Department of Biomechatronics Engineering, National Taiwan University , No. 1, Sec. 4, Roosevelt Road, Taipei 10617 ,","place":["Taiwan"]}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4200-8137","authenticated-orcid":false,"given":"Huai-Kuang","family":"Tsai","sequence":"additional","affiliation":[{"name":"Taiwan AI Labs , 6F., No. 70, Sec. 1, Chengde Road, Datong Dist., Taipei 10355 ,","place":["Taiwan"]},{"name":"Institute of Information Science, Academia Sinica , No. 128, Academia Road, Section 2, Nankang, Taipei 11529 ,","place":["Taiwan"]}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5910-5419","authenticated-orcid":false,"given":"Jia-Hsin","family":"Huang","sequence":"additional","affiliation":[{"name":"Taiwan AI Labs , 6F., No. 70, Sec. 1, Chengde Road, Datong Dist., Taipei 10355 ,","place":["Taiwan"]}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"286","published-online":{"date-parts":[[2025,2,25]]},"reference":[{"key":"2025022513081011300_ref1","doi-asserted-by":"publisher","first-page":"937","DOI":"10.1038\/nbt.4267","article-title":"How user intelligence is improving PubMed","volume":"36","author":"Fiorini","year":"2018","journal-title":"Nat Biotechnol"},{"key":"2025022513081011300_ref2","doi-asserted-by":"publisher","first-page":"W616","DOI":"10.1093\/nar\/gkac310","article-title":"pubmedKB: an interactive web server for exploring biomedical entity relations in the biomedical literature","volume":"50","author":"Li","year":"2022","journal-title":"Nucleic Acids Res"},{"key":"2025022513081011300_ref3","doi-asserted-by":"publisher","first-page":"104988","DOI":"10.1016\/j.ebiom.2024.104988","article-title":"PubMed and beyond: biomedical literature search in the age of artificial intelligence","volume":"100","author":"Jin","year":"2024","journal-title":"EBioMedicine"},{"key":"2025022513081011300_ref4","doi-asserted-by":"publisher","first-page":"D862","DOI":"10.1093\/nar\/gkv1222","article-title":"ClinVar: public archive of interpretations of clinically relevant variants","volume":"44","author":"Landrum","year":"2016","journal-title":"Nucleic Acids Res"},{"key":"2025022513081011300_ref5","doi-asserted-by":"publisher","first-page":"D941","DOI":"10.1093\/nar\/gky1015","article-title":"COSMIC: the catalogue of somatic mutations In cancer","volume":"47","author":"Tate","year":"2019","journal-title":"Nucleic Acids Res"},{"key":"2025022513081011300_ref6","doi-asserted-by":"publisher","first-page":"D1038","DOI":"10.1093\/nar\/gky1151","article-title":"OMIM.org: leveraging knowledge across phenotype-gene relationships","volume":"47","author":"Amberger","year":"2019","journal-title":"Nucleic Acids Res"},{"key":"2025022513081011300_ref7","doi-asserted-by":"publisher","first-page":"563","DOI":"10.1002\/cpt.2350","article-title":"An evidence-based framework for evaluating pharmacogenomics knowledge for personalized medicine","volume":"110","author":"Whirl-Carrillo","year":"2021","journal-title":"Clin Pharmacol Ther"},{"key":"2025022513081011300_ref8","doi-asserted-by":"publisher","first-page":"589","DOI":"10.1016\/j.molcel.2006.02.012","article-title":"Biomedical language processing: what's beyond PubMed?","volume":"21","author":"Hunter","year":"2006","journal-title":"Mol Cell"},{"key":"2025022513081011300_ref9","doi-asserted-by":"publisher","first-page":"i41","DOI":"10.1093\/bioinformatics\/btm229","article-title":"Manual curation is not sufficient for annotation of genomic databases","volume":"23","author":"Baumgartner","year":"2007","journal-title":"Bioinformatics"},{"key":"2025022513081011300_ref10","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1186\/s12859-015-0472-9","article-title":"Extraction of relations between genes and diseases from text and large-scale data analysis: implications for translational research","volume":"16","author":"Bravo","year":"2015","journal-title":"BMC Bioinform"},{"key":"2025022513081011300_ref11","doi-asserted-by":"publisher","first-page":"D833","DOI":"10.1093\/nar\/gkw943","article-title":"DisGeNET: a comprehensive platform integrating information on human disease-associated genes and variants","volume":"45","author":"Pinero","year":"2017","journal-title":"Nucleic Acids Res"},{"key":"2025022513081011300_ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-17083-7_17","article-title":"RENET: a deep learning approach for extracting gene-disease associations from literature","volume":"11467","author":"Wu","year":"2019","journal-title":"Research in Computational Molecular Biology"},{"key":"2025022513081011300_ref13","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1038\/s41597-019-0342-9","article-title":"PGxCorpus, a manually annotated corpus for pharmacogenomics","volume":"7","author":"Legrand","year":"2020","journal-title":"Sci Data"},{"key":"2025022513081011300_ref14","doi-asserted-by":"publisher","first-page":"556","DOI":"10.1093\/infdis\/jiaa332","article-title":"ACE2 expression is increased in the lungs of patients with comorbidities associated with severe COVID-19","volume":"222","author":"Pinto","year":"2020","journal-title":"J Infect Dis"},{"key":"2025022513081011300_ref15","doi-asserted-by":"publisher","first-page":"650","DOI":"10.1038\/s41593-023-01259-x","article-title":"Molecular and network-level mechanisms explaining individual differences in autism spectrum disorder","volume":"26","author":"Buch","year":"2023","journal-title":"Nat Neurosci"},{"key":"2025022513081011300_ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.jbi.2023.104464","article-title":"Graph embedding-based link prediction for literature-based discovery in Alzheimer's disease","volume":"145","author":"Pu","year":"2023","journal-title":"J Biomed Inform"},{"key":"2025022513081011300_ref17","doi-asserted-by":"publisher","first-page":"D638","DOI":"10.1093\/nar\/gkac1000","article-title":"The STRING database in 2023: protein-protein association networks and functional enrichment analyses for any sequenced genome of interest","volume":"51","author":"Szklarczyk","year":"2023","journal-title":"Nucleic Acids Res"},{"key":"2025022513081011300_ref18","doi-asserted-by":"publisher","first-page":"3619","DOI":"10.1093\/bioinformatics\/btw503","article-title":"DTMiner: identification of potential disease targets through biomedical literature mining","volume":"32","author":"Xu","year":"2016","journal-title":"Bioinformatics"},{"key":"2025022513081011300_ref19","doi-asserted-by":"publisher","first-page":"2614","DOI":"10.1093\/bioinformatics\/bty114","article-title":"A global network of biomedical relationships derived from text","volume":"34","author":"Percha","year":"2018","journal-title":"Bioinformatics"},{"key":"2025022513081011300_ref20","doi-asserted-by":"publisher","first-page":"104487","DOI":"10.1016\/j.jbi.2023.104487","article-title":"BioREx: improving biomedical relation extraction by leveraging heterogeneous datasets","volume":"146","author":"Lai","year":"2023","journal-title":"J Biomed Inform"},{"key":"2025022513081011300_ref21","doi-asserted-by":"publisher","first-page":"W540","DOI":"10.1093\/nar\/gkae235","article-title":"PubTator 3.0: an AI-powered literature resource for unlocking biomedical knowledge","volume":"52","author":"Wei","year":"2024","journal-title":"Nucleic Acids Res"},{"key":"2025022513081011300_ref22","doi-asserted-by":"publisher","DOI":"10.3390\/app9183698","article-title":"Neural machine reading comprehension: methods and trends","volume":"9","author":"Liu","year":"2019","journal-title":"Appl Sci"},{"key":"2025022513081011300_ref23","doi-asserted-by":"publisher","first-page":"1233","DOI":"10.1056\/NEJMsr2214184","article-title":"Benefits, limits, and risks of GPT-4 as an AI Chatbot for medicine","volume":"388","author":"Lee","year":"2023","journal-title":"N Engl J Med"},{"key":"2025022513081011300_ref24","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1038\/s41586-023-06291-2","article-title":"Large language models encode clinical knowledge","volume":"620","author":"Singhal","year":"2023","journal-title":"Nature"},{"key":"2025022513081011300_ref25","doi-asserted-by":"publisher","first-page":"1462","DOI":"10.1038\/s41592-024-02235-4","article-title":"Assessing GPT-4 for cell type annotation in single-cell RNA-seq analysis","volume":"21","author":"Hou","year":"2024","journal-title":"Nat Methods"},{"key":"2025022513081011300_ref26","doi-asserted-by":"publisher","first-page":"1302","DOI":"10.1681\/ASN.0000000000000166","article-title":"Retrieve, summarize, and Verify: how will ChatGPT affect information seeking from the medical literature?","volume":"34","author":"Jin","year":"2023","journal-title":"J Am Soc Nephrol"},{"key":"2025022513081011300_ref27","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive NLP tasks","volume":"33","author":"Lewis","year":"2020","journal-title":"Adv Neural Inf Process Syst"},{"key":"2025022513081011300_ref28","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2312.10997","article-title":"Retrieval-augmented generation for large language models: a survey.","author":"Gao","year":"2023"},{"key":"2025022513081011300_ref29","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang","year":"2022","journal-title":"Adv Neural Inf Process Syst"},{"key":"2025022513081011300_ref30","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Adv Neural Inf Process Syst"},{"key":"2025022513081011300_ref31","doi-asserted-by":"crossref","first-page":"447","DOI":"10.18653\/v1\/D13-1043","volume-title":"Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing","author":"Mesquita","year":"2013"},{"key":"2025022513081011300_ref32","doi-asserted-by":"publisher","DOI":"10.21105\/joss.00861","article-title":"UMAP: uniform manifold approximation and projection","volume":"3","author":"McInnes","year":"2018","journal-title":"J Open Source Softw"},{"key":"2025022513081011300_ref33","doi-asserted-by":"publisher","first-page":"254","DOI":"10.1007\/s10791-009-9112-1","article-title":"Adapting boosting for information retrieval measures","volume":"13","author":"Wu","year":"2009","journal-title":"Inf Retr"},{"key":"2025022513081011300_ref34","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2311.07361","article-title":"The impact of large language models on scientific discovery: a preliminary study using GPT-4. arXiv Preprint","author":"Research AI4Science, M","year":"2023"},{"key":"2025022513081011300_ref35","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1162\/tacl_a_00632","article-title":"Benchmarking large language models for news summarization","volume":"12","author":"Zhang","year":"2024","journal-title":"Trans Assoc Comput Linguist"},{"key":"2025022513081011300_ref36","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2403.08295","article-title":"Gemma: open models based on Gemini research and technology","author":"Team","year":"2024"},{"key":"2025022513081011300_ref37","first-page":"10088","article-title":"QLoRA: efficient Finetuning of quantized LLMs","volume":"36","author":"Dettmers","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"2025022513081011300_ref38","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2311.16079","article-title":"MEDITRON-70B: scaling medical Pretraining for large language","author":"Chen","year":"2023"},{"key":"2025022513081011300_ref39","doi-asserted-by":"crossref","first-page":"89","DOI":"10.1145\/1102351.1102363","volume-title":"Proceedings of the 22nd international conference on Machine learning - ICML '05","author":"Burges","year":"2005"},{"key":"2025022513081011300_ref40","doi-asserted-by":"publisher","first-page":"422","DOI":"10.1145\/582415.582418","article-title":"Cumulated gain-based evaluation of IR techniques","volume":"20","author":"J\u00e4rvelin","year":"2002","journal-title":"ACM Trans Inf Syst"},{"key":"2025022513081011300_ref41","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7503.003.0029","article-title":"Learning to rank with nonsmooth cost functions","volume":"19","author":"Burges","year":"2006","journal-title":"Adv Neural Inf Process Syst"},{"key":"2025022513081011300_ref42","doi-asserted-by":"publisher","first-page":"267D","DOI":"10.1093\/nar\/gkh061","article-title":"The unified medical language system (UMLS): integrating biomedical terminology","volume":"32","author":"Bodenreider","year":"2004","journal-title":"Nucleic Acids Res"}],"container-title":["Briefings in Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bib\/article-pdf\/26\/1\/bbaf070\/62166238\/bbaf070.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bib\/article-pdf\/26\/1\/bbaf070\/62166238\/bbaf070.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T14:02:14Z","timestamp":1740492134000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bib\/article\/doi\/10.1093\/bib\/bbaf070\/8042066"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"references-count":42,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,11,22]]}},"URL":"https:\/\/doi.org\/10.1093\/bib\/bbaf070","relation":{},"ISSN":["1467-5463","1477-4054"],"issn-type":[{"value":"1467-5463","type":"print"},{"value":"1477-4054","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2025,1]]},"published":{"date-parts":[[2024,11,22]]},"article-number":"bbaf070"}}