{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T00:17:23Z","timestamp":1760660243123,"version":"build-2065373602"},"reference-count":28,"publisher":"Oxford University Press (OUP)","issue":"10","license":[{"start":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T00:00:00Z","timestamp":1756166400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"French government, managed by the Agence Nationale de la Recherche","award":["ANR-21-EXES-0005"],"award-info":[{"award-number":["ANR-21-EXES-0005"]}]},{"name":"Occitanie Region"},{"name":"ExposUM Institute of the University of Montpellier"},{"DOI":"10.13039\/100010269","name":"Wellcome Trust","doi-asserted-by":"publisher","award":["104111\/Z\/14\/ZR"],"award-info":[{"award-number":["104111\/Z\/14\/ZR"]}],"id":[{"id":"10.13039\/100010269","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,2]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Motivation<\/jats:title>\n                  <jats:p>Inferring protein homology from sequence information is essential for understanding species evolution and enabling functional annotation transfer. Besides similarity-based methods, several machine learning approaches have been developed using various ways of representing protein data.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>Here, we represent proteins with a biologically oriented large language model and apply k-means clustering to the embedded data to extract homology relationships. Although our approach lacks the sensitivity of other tools, we obtain better precision for the detection of n:m orthologs. Furthermore, we successfully reconstruct full orthologous groups from scratch, highlighting the growing potential of using large language models in combination with clustering algorithms for the analysis of protein data.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>Datasets are available on OrthoMCL-DB as indicated in the Methods. Source code is available on GitHub at https:\/\/github.com\/ThomasGTHB\/OrthoLM and Zenodo at https:\/\/doi.org\/10.5281\/zenodo.16640170.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btaf472","type":"journal-article","created":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T18:20:20Z","timestamp":1756232420000},"source":"Crossref","is-referenced-by-count":0,"title":["Exploring homology detection via k-means clustering of proteins embedded with a large language model"],"prefix":"10.1093","volume":"41","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1300-3732","authenticated-orcid":false,"given":"Thomas","family":"Minotto","sequence":"first","affiliation":[{"name":"LPHI, CNRS, University of Montpellier , Montpellier 34095,","place":["France"]},{"name":"IMAG, CNRS, University of Montpellier , Montpellier 34095,","place":["France"]}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4277-0914","authenticated-orcid":false,"given":"Antoine","family":"Claessens","sequence":"additional","affiliation":[{"name":"LPHI, CNRS, University of Montpellier , Montpellier 34095,","place":["France"]}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1246-7404","authenticated-orcid":false,"given":"Thomas D","family":"Otto","sequence":"additional","affiliation":[{"name":"LPHI, CNRS, University of Montpellier , Montpellier 34095,","place":["France"]},{"name":"School of Infection and Immunity, University of Glasgow , Glasgow G12 8TA,","place":["United Kingdom"]}]}],"member":"286","published-online":{"date-parts":[[2025,8,26]]},"reference":[{"key":"2025101607370675900_btaf472-B1","doi-asserted-by":"publisher","first-page":"3389","DOI":"10.1093\/nar\/25.17.3389","article-title":"Gapped BLAST and PSI-BLAST: a new generation of protein database search programs","volume":"25","author":"Altschul","year":"1997","journal-title":"Nucleic Acids Res"},{"key":"2025101607370675900_btaf472-B2","doi-asserted-by":"crossref","first-page":"403","DOI":"10.1016\/S0022-2836(05)80360-2","article-title":"Basic local alignment search tool","volume":"215","author":"Altschul","year":"1990","journal-title":"J Mol Biol"},{"key":"2025101607370675900_btaf472-B3","doi-asserted-by":"publisher","first-page":"223","DOI":"10.1126\/science.181.4096.223","article-title":"Principles that govern the folding of protein chains","volume":"181","author":"Anfinsen","year":"1973","journal-title":"Science"},{"key":"2025101607370675900_btaf472-B4","doi-asserted-by":"publisher","first-page":"9646","DOI":"10.1038\/s41467-024-53982-z","article-title":"SSEmb: a joint embedding of protein sequence and structure enables robust variant effect predictions","volume":"15","author":"Blaabjerg","year":"2024","journal-title":"Nat Commun"},{"key":"2025101607370675900_btaf472-B5","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1186\/s13059-024-03298-4","article-title":"SonicParanoid2: fast, accurate, and comprehensive orthology inference with machine learning and language models","volume":"25","author":"Cosentino","year":"2024","journal-title":"Genome Biol"},{"key":"2025101607370675900_btaf472-B6","doi-asserted-by":"publisher","first-page":"evad084","DOI":"10.1093\/gbe\/evad084","article-title":"Unsupervised deep learning can identify protein functional groups from unaligned sequences","volume":"15","author":"David","year":"2023","journal-title":"Genome Biol Evol"},{"key":"2025101607370675900_btaf472-B7","doi-asserted-by":"publisher","first-page":"7112","DOI":"10.1109\/TPAMI.2021.3095381","article-title":"ProtTrans: toward understanding the language of life through self-supervised learning","volume":"44","author":"Elnaggar","year":"2022","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2025101607370675900_btaf472-B8","doi-asserted-by":"publisher","first-page":"238","DOI":"10.1186\/s13059-019-1832-y","article-title":"OrthoFinder: phylogenetic orthology inference for comparative genomics","volume":"20","author":"Emms","year":"2019","journal-title":"Genome Biol"},{"key":"2025101607370675900_btaf472-B9","doi-asserted-by":"publisher","first-page":"975","DOI":"10.1038\/s41587-023-01917-2","article-title":"Protein remote homology detection and structural alignment using deep learning","volume":"42","author":"Hamamsy","year":"2024","journal-title":"Nat Biotechnol"},{"key":"2025101607370675900_btaf472-B10","doi-asserted-by":"crossref","first-page":"850","DOI":"10.1126\/science.ads0018","article-title":"Simulating 500 million years of evolution with a language model","volume":"387","author":"Hayes","year":"2025","journal-title":"Science"},{"key":"2025101607370675900_btaf472-B11","doi-asserted-by":"publisher","first-page":"723","DOI":"10.1186\/s12859-019-3220-8","article-title":"Modeling aspects of the language of life through transfer-learning protein sequences","volume":"20","author":"Heinzinger","year":"2019","journal-title":"BMC Bioinformatics"},{"key":"2025101607370675900_btaf472-B12","doi-asserted-by":"publisher","first-page":"309","DOI":"10.1146\/annurev.genet.39.073003.114725","article-title":"Orthologs, paralogs, and evolutionary genomics","volume":"39","author":"Koonin","year":"2005","journal-title":"Annu Rev Genet"},{"key":"2025101607370675900_btaf472-B13","doi-asserted-by":"publisher","first-page":"D445","DOI":"10.1093\/nar\/gkac998","article-title":"OrthoDB v11: annotation of orthologs in the widest sampling of organismal diversity","volume":"51","author":"Kuznetsov","year":"2023","journal-title":"Nucleic Acids Res"},{"key":"2025101607370675900_btaf472-B14","doi-asserted-by":"publisher","first-page":"2178","DOI":"10.1101\/gr.1224503","article-title":"OrthoMCL: identification of ortholog groups for eukaryotic genomes","volume":"13","author":"Li","year":"2003","journal-title":"Genome Res"},{"key":"2025101607370675900_btaf472-B15","doi-asserted-by":"crossref","first-page":"1123","DOI":"10.1126\/science.ade2574","article-title":"Evolutionary-scale prediction of atomic-level protein structure with a language model","volume":"379","author":"Lin","year":"2023","journal-title":"Science"},{"key":"2025101607370675900_btaf472-B16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1099\/mgen.0.000341","article-title":"An assessment of genome annotation coverage across the bacterial tree of life","volume":"6","author":"Lobb","year":"2020","journal-title":"Microb Genom"},{"key":"2025101607370675900_btaf472-B17","article-title":"Distributed representations of words and phrases and their compositionality","author":"Mikolov","year":"2013","journal-title":"Advances in Neural Information Processing Systems 26 (NIPS 2013, Stateline, Nevada, USA)"},{"key":"2025101607370675900_btaf472-B18","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1038\/nmeth.1818","article-title":"HHblits: lightning-fast iterative protein sequence searching by HMM-HMM alignment","volume":"9","author":"Remmert","year":"2012","journal-title":"Nat Methods"},{"key":"2025101607370675900_btaf472-B19","doi-asserted-by":"publisher","first-page":"e2016239118","DOI":"10.1073\/pnas.2016239118","article-title":"Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences","volume":"118","author":"Rives","year":"2021","journal-title":"Proc Natl Acad Sci USA"},{"key":"2025101607370675900_btaf472-B20","doi-asserted-by":"publisher","first-page":"1492","DOI":"10.1038\/s41592-024-02191-z","article-title":"Toward universal cell embeddings: integrating single-cell RNA-seq datasets across species with SATURN","volume":"21","author":"Rosen","year":"2024","journal-title":"Nat Methods"},{"key":"2025101607370675900_btaf472-B21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3068335","article-title":"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN","volume":"42","author":"Schubert","year":"2017","journal-title":"ACM Trans Database Syst"},{"key":"2025101607370675900_btaf472-B22","doi-asserted-by":"publisher","first-page":"951","DOI":"10.1093\/bioinformatics\/bti125","article-title":"Protein homology detection by HMM\u2013HMM comparison","volume":"21","author":"S\u00f6ding","year":"2005","journal-title":"Bioinformatics"},{"key":"2025101607370675900_btaf472-B23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1093\/bioinformatics\/btae567","article-title":"Accurate and efficient protein embedding using multi-teacher distillation learning","volume":"40","author":"Shang","year":"2024","journal-title":"Bioinformatics"},{"key":"2025101607370675900_btaf472-B24","doi-asserted-by":"publisher","first-page":"W29","DOI":"10.1093\/nar\/gkw292","article-title":"Companion: a web server for annotation and analysis of parasite genomes","volume":"44","author":"Steinbiss","year":"2016","journal-title":"Nucleic Acids Res"},{"key":"2025101607370675900_btaf472-B25","doi-asserted-by":"publisher","first-page":"926","DOI":"10.1093\/bioinformatics\/btu739","article-title":"UniRef clusters: a comprehensive and scalable alternative for improving sequence similarity searches","volume":"31","author":"Suzek","year":"2015","journal-title":"Bioinformatics"},{"key":"2025101607370675900_btaf472-B26","first-page":"2837","article-title":"Information theoretic measures for clusterings comparison: variants, properties, normalization and correction for chance","volume":"11","author":"Vinh","year":"2010","journal-title":"Journal of Machine Learning Research"},{"key":"2025101607370675900_btaf472-B27","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-24277-4","volume-title":"ggplot2: Elegant Graphics for Data Analysis","author":"Wickham","year":"2016"},{"key":"2025101607370675900_btaf472-B28","doi-asserted-by":"publisher","first-page":"774846","DOI":"10.3389\/fgene.2021.774846","article-title":"Use ggbreak to effectively utilize plotting space to deal with large datasets and outliers","volume":"12","author":"Xu","year":"2021","journal-title":"Front Genet"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btaf472\/64140918\/btaf472.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/41\/10\/btaf472\/64140918\/btaf472.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/41\/10\/btaf472\/64140918\/btaf472.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,16]],"date-time":"2025-10-16T11:37:13Z","timestamp":1760614633000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btaf472\/8241956"}},"subtitle":[],"editor":[{"given":"Jianlin","family":"Cheng","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2025,8,26]]},"references-count":28,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,10,2]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btaf472","relation":{},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"type":"print","value":"1367-4803"},{"type":"electronic","value":"1367-4811"}],"subject":[],"published-other":{"date-parts":[[2025,10]]},"published":{"date-parts":[[2025,8,26]]},"article-number":"btaf472"}}