{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T16:52:16Z","timestamp":1779123136859,"version":"3.51.4"},"reference-count":45,"publisher":"Oxford University Press (OUP)","issue":"1","license":[{"start":{"date-parts":[[2023,1,17]],"date-time":"2023-01-17T00:00:00Z","timestamp":1673913600000},"content-version":"vor","delay-in-days":16,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100000268","name":"BBSRC","doi-asserted-by":"publisher","award":["BB\/V014722\/1"],"award-info":[{"award-number":["BB\/V014722\/1"]}],"id":[{"id":"10.13039\/501100000268","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000268","name":"BBSRC","doi-asserted-by":"publisher","award":["BB\/R009597\/1"],"award-info":[{"award-number":["BB\/R009597\/1"]}],"id":[{"id":"10.13039\/501100000268","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000268","name":"BBSRC","doi-asserted-by":"publisher","award":["BB\/R014892\/1"],"award-info":[{"award-number":["BB\/R014892\/1"]}],"id":[{"id":"10.13039\/501100000268","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000268","name":"BBSRC","doi-asserted-by":"publisher","award":["BB\/S020144\/1"],"award-info":[{"award-number":["BB\/S020144\/1"]}],"id":[{"id":"10.13039\/501100000268","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000268","name":"BBSRC","doi-asserted-by":"publisher","award":["BB\/S020039\/1"],"award-info":[{"award-number":["BB\/S020039\/1"]}],"id":[{"id":"10.13039\/501100000268","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Software Campus 2.0","award":["01IS17049"],"award-info":[{"award-number":["01IS17049"]}]},{"name":"German Ministry for Research and Education"},{"DOI":"10.13039\/501100001659","name":"Deutsche Forschungsgemeinschaft","doi-asserted-by":"publisher","award":["DFG\u2013GZ"],"award-info":[{"award-number":["DFG\u2013GZ"]}],"id":[{"id":"10.13039\/501100001659","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001659","name":"Deutsche Forschungsgemeinschaft","doi-asserted-by":"publisher","award":["RO1320\/4\u20131"],"award-info":[{"award-number":["RO1320\/4\u20131"]}],"id":[{"id":"10.13039\/501100001659","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Bavarian Ministry of Education"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:sec>\n                    <jats:title>Motivation<\/jats:title>\n                    <jats:p>CATH is a protein domain classification resource that exploits an automated workflow of structure and sequence comparison alongside expert manual curation to construct a hierarchical classification of evolutionary and structural relationships. The aim of this study was to develop algorithms for detecting remote homologues missed by state-of-the-art hidden Markov model (HMM)-based approaches. The method developed (CATHe) combines a neural network with sequence representations obtained from protein language models. It was assessed using a dataset of remote homologues having less than 20% sequence identity to any domain in the training set.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Results<\/jats:title>\n                    <jats:p>The CATHe models trained on 1773 largest and 50 largest CATH superfamilies had an accuracy of 85.6\u2009\u00b1\u20090.4% and 98.2\u2009\u00b1\u20090.3%, respectively. As a further test of the power of CATHe to detect more remote homologues missed by HMMs derived from CATH domains, we used a dataset consisting of protein domains that had annotations in Pfam, but not in CATH. By using highly reliable CATHe predictions (expected error rate &amp;lt;0.5%), we were able to provide CATH annotations for 4.62 million Pfam domains. For a subset of these domains from Homo sapiens, we structurally validated 90.86% of the predictions by comparing their corresponding AlphaFold2 structures with structures from the CATH superfamilies to which they were assigned.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Availability and implementation<\/jats:title>\n                    <jats:p>The code for the developed models is available on https:\/\/github.com\/vam-sin\/CATHe, and the datasets developed in this study can be accessed on https:\/\/zenodo.org\/record\/6327572.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Supplementary information<\/jats:title>\n                    <jats:p>Supplementary data are available at Bioinformatics online.<\/jats:p>\n                  <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btad029","type":"journal-article","created":{"date-parts":[[2023,1,16]],"date-time":"2023-01-16T15:27:57Z","timestamp":1673882877000},"source":"Crossref","is-referenced-by-count":37,"title":["CATHe: detection of remote homologues for CATH superfamilies using embeddings from protein language models"],"prefix":"10.1093","volume":"39","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4750-038X","authenticated-orcid":false,"given":"Vamsi","family":"Nallapareddy","sequence":"first","affiliation":[{"name":"Institute of Structural and Molecular Biology, University College London , London WC1E 6BT, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6568-9035","authenticated-orcid":false,"given":"Nicola","family":"Bordin","sequence":"additional","affiliation":[{"name":"Institute of Structural and Molecular Biology, University College London , London WC1E 6BT, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1091-9144","authenticated-orcid":false,"given":"Ian","family":"Sillitoe","sequence":"additional","affiliation":[{"name":"Institute of Structural and Molecular Biology, University College London , London WC1E 6BT, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9601-3580","authenticated-orcid":false,"given":"Michael","family":"Heinzinger","sequence":"additional","affiliation":[{"name":"Department of Informatics , Bioinformatics and Computational Biology\u2014i12 , , Garching\/Munich 85748, Germany"},{"name":"Technical University of Munich (TUM) , Bioinformatics and Computational Biology\u2014i12 , , Garching\/Munich 85748, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8533-8163","authenticated-orcid":false,"given":"Maria","family":"Littmann","sequence":"additional","affiliation":[{"name":"Department of Informatics , Bioinformatics and Computational Biology\u2014i12 , , Garching\/Munich 85748, Germany"},{"name":"Technical University of Munich (TUM) , Bioinformatics and Computational Biology\u2014i12 , , Garching\/Munich 85748, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vaishali P","family":"Waman","sequence":"additional","affiliation":[{"name":"Institute of Structural and Molecular Biology, University College London , London WC1E 6BT, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3324-5755","authenticated-orcid":false,"given":"Neeladri","family":"Sen","sequence":"additional","affiliation":[{"name":"Institute of Structural and Molecular Biology, University College London , London WC1E 6BT, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Burkhard","family":"Rost","sequence":"additional","affiliation":[{"name":"Department of Informatics , Bioinformatics and Computational Biology\u2014i12 , , Garching\/Munich 85748, Germany"},{"name":"Technical University of Munich (TUM) , Bioinformatics and Computational Biology\u2014i12 , , Garching\/Munich 85748, Germany"},{"name":"Institute for Advanced Study (TUM-IAS) , Garching\/Munich 85748, Germany"},{"name":"TUM School of Life Sciences Weihenstephan (WZW) 85354, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christine","family":"Orengo","sequence":"additional","affiliation":[{"name":"Institute of Structural and Molecular Biology, University College London , London WC1E 6BT, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"286","published-online":{"date-parts":[[2023,1,17]]},"reference":[{"key":"2023013106114666900_btad029-B1","author":"Agarap","year":"2019"},{"key":"2023013106114666900_btad029-B2","doi-asserted-by":"crossref","DOI":"10.1093\/database\/baw093","article-title":"The Ensembl gene annotation system","volume":"2016","author":"Aken","year":"2016","journal-title":"Database"},{"key":"2023013106114666900_btad029-B3","doi-asserted-by":"crossref","first-page":"403","DOI":"10.1016\/S0022-2836(05)80360-2","article-title":"Basic local alignment search tool","volume":"215","author":"Altschul","year":"1990","journal-title":"J. Mol. Biol"},{"key":"2023013106114666900_btad029-B4","doi-asserted-by":"crossref","first-page":"654","DOI":"10.1016\/j.cels.2021.05.017","article-title":"Learning the protein language: Evolution, structure, and function","volume":"12","author":"Bepler","year":"2021","journal-title":"Cell Syst"},{"key":"2023013106114666900_btad029-B5","doi-asserted-by":"crossref","first-page":"932","DOI":"10.1038\/s41587-021-01179-w","article-title":"Using deep learning to annotate the protein universe","author":"Bileschi","year":"2022","journal-title":"Nat. Biotechnol"},{"key":"2023013106114666900_btad029-B6","doi-asserted-by":"crossref","DOI":"10.1093\/nar\/28.1.254","article-title":"The ASTRAL compendium for protein structure and sequence analysis","volume":"28","author":"Brenner","year":"2000","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B7","first-page":"211","author":"Bridle","year":"1989"},{"key":"2023013106114666900_btad029-B8","doi-asserted-by":"crossref","first-page":"1522","DOI":"10.1016\/j.str.2010.08.017","article-title":"Detailed analysis of function divergence in a large and diverse domain superfamily: Toward a refined protocol of function classification","volume":"18","author":"Dessailly","year":"2010","journal-title":"Structure (London, England: 1993)"},{"key":"2023013106114666900_btad029-B9","doi-asserted-by":"crossref","first-page":"7112","DOI":"10.1109\/TPAMI.2021.3095381","article-title":"ProtTrans: Toward understanding the language of life through self-supervised learning","volume":"44","author":"Elnaggar","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell"},{"key":"2023013106114666900_btad029-B10","doi-asserted-by":"crossref","first-page":"5304","DOI":"10.1093\/bioinformatics\/btaa1051","article-title":"DeepNOG: Fast and accurate protein orthologous group assignment","volume":"36","author":"Feldbauer","year":"2020","journal-title":"Bioinformatics"},{"key":"2023013106114666900_btad029-B11","doi-asserted-by":"crossref","first-page":"770","DOI":"10.1109\/CVPR.2016.90","article-title":"Deep residual learning for image recognition","author":"He","year":"2016","journal-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"2023013106114666900_btad029-B12","doi-asserted-by":"crossref","first-page":"1019597","DOI":"10.3389\/fbinf.2022.1019597","article-title":"SETH predicts nuances of residue disorder from protein embeddings","volume":"2","author":"Ilzh\u00f6fer","year":"2022","journal-title":"Front. Bioinform"},{"key":"2023013106114666900_btad029-B13","first-page":"448","author":"Ioffe","year":"2015"},{"key":"2023013106114666900_btad029-B14","doi-asserted-by":"crossref","first-page":"431","DOI":"10.1186\/1471-2105-11-431","article-title":"Hidden Markov model speed heuristic and iterative HMM search procedure","volume":"11","author":"Johnson","year":"2010","journal-title":"BMC Bioinformatics"},{"key":"2023013106114666900_btad029-B15","doi-asserted-by":"crossref","first-page":"583","DOI":"10.1038\/s41586-021-03819-2","article-title":"Highly accurate protein structure prediction with AlphaFold","volume":"596","author":"Jumper","year":"2021","journal-title":"Nature"},{"key":"2023013106114666900_btad029-B16","doi-asserted-by":"crossref","first-page":"2577","DOI":"10.1002\/bip.360221211","article-title":"Dictionary of protein secondary structure: Pattern recognition of hydrogen-bonded and geometrical features","volume":"22","author":"Kabsch","year":"1983","journal-title":"Biopolymers"},{"key":"2023013106114666900_btad029-B17","author":"Kingma","year":"2017"},{"key":"2023013106114666900_btad029-B18","doi-asserted-by":"crossref","first-page":"D435","DOI":"10.1093\/nar\/gkx1069","article-title":"Gene3D: Extensive prediction of globular domains in proteins","volume":"46","author":"Lewis","year":"2018","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B19","author":"Lin","year":"2022"},{"key":"2023013106114666900_btad029-B20","doi-asserted-by":"crossref","first-page":"1160","DOI":"10.1038\/s41598-020-80786-0","article-title":"Embeddings from deep learning transfer GO annotations beyond homology","volume":"11","author":"Littmann","year":"2021","journal-title":"Sci. Rep"},{"key":"2023013106114666900_btad029-B21","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"van der Maaten","year":"2008","journal-title":"J. Mach. Learn. Res"},{"key":"2023013106114666900_btad029-B22","doi-asserted-by":"crossref","first-page":"2630","DOI":"10.1093\/bioinformatics\/btn504","article-title":"Profile Comparer: A program for scoring and aligning profile hidden Markov models","volume":"24","author":"Madera","year":"2008","journal-title":"Bioinformatics"},{"key":"2023013106114666900_btad029-B23","author":"Meier","year":"2021"},{"key":"2023013106114666900_btad029-B24","doi-asserted-by":"crossref","first-page":"D412","DOI":"10.1093\/nar\/gkaa913","article-title":"Pfam: The protein families database in 2021","volume":"49","author":"Mistry","year":"2021","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B25","doi-asserted-by":"crossref","first-page":"e121","DOI":"10.1093\/nar\/gkt263","article-title":"Challenges in homology search: HMMER3 and convergent evolution of coiled-coil regions","volume":"41","author":"Mistry","year":"2013","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B26","first-page":"D570","article-title":"MGnify: The microbiome analysis resource in 2020","volume":"48","author":"Mitchell","year":"2020","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B27","doi-asserted-by":"crossref","first-page":"D351","DOI":"10.1093\/nar\/gky1100","article-title":"InterPro in 2019: Improving coverage, classification and access to protein sequence annotations","volume":"47","author":"Mitchell","year":"2019","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B29","first-page":"2825","article-title":"Scikit-learn: Machine learning in python","volume":"12","author":"Pedregosa","year":"2011","journal-title":"J. Mach. Learn. Res"},{"key":"2023013106114666900_btad029-B30","doi-asserted-by":"crossref","first-page":"221","DOI":"10.1038\/nmeth.2340","article-title":"A large-scale evaluation of computational protein function prediction","volume":"10","author":"Radivojac","year":"2013","journal-title":"Nat. Methods"},{"key":"2023013106114666900_btad029-B31","doi-asserted-by":"crossref","first-page":"e232","DOI":"10.1371\/journal.pcbi.0030232","article-title":"CATHEDRAL: A fast and effective algorithm to predict folds and domain boundaries from multidomain protein structures","volume":"3","author":"Redfern","year":"2007","journal-title":"PLoS Comput. Biol"},{"key":"2023013106114666900_btad029-B32","doi-asserted-by":"crossref","first-page":"173","DOI":"10.1038\/nmeth.1818","article-title":"HHblits: Lightning-fast iterative protein sequence searching by HMM\u2013HMM alignment","volume":"9","author":"Remmert","year":"2012","journal-title":"Nat. Methods"},{"key":"2023013106114666900_btad029-B33","doi-asserted-by":"crossref","first-page":"i254","DOI":"10.1093\/bioinformatics\/bty275","article-title":"DeepFam: Deep learning based alignment-free method for protein family modeling and prediction","volume":"34","author":"Seo","year":"2018","journal-title":"Bioinformatics"},{"key":"2023013106114666900_btad029-B34","doi-asserted-by":"crossref","first-page":"D266","DOI":"10.1093\/nar\/gkaa1079","article-title":"CATH: Increased structural coverage of functional space","volume":"49","author":"Sillitoe","year":"2021","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B35","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"J. Mach. Learn. Res"},{"key":"2023013106114666900_btad029-B36","doi-asserted-by":"crossref","first-page":"473","DOI":"10.1186\/s12859-019-3019-7","article-title":"HH-suite3 for fast remote homology detection and deep protein annotation","volume":"20","author":"Steinegger","year":"2019","journal-title":"BMC Bioinformatics"},{"key":"2023013106114666900_btad029-B37","doi-asserted-by":"crossref","first-page":"603","DOI":"10.1038\/s41592-019-0437-4","article-title":"Protein-level assembly increases protein sequence recovery from metagenomic samples manyfold","volume":"16","author":"Steinegger","year":"2019","journal-title":"Nat. Methods"},{"key":"2023013106114666900_btad029-B38","doi-asserted-by":"crossref","first-page":"1026","DOI":"10.1038\/nbt.3988","article-title":"MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets","volume":"35","author":"Steinegger","year":"2017","journal-title":"Nat. Biotechnol"},{"key":"2023013106114666900_btad029-B39","doi-asserted-by":"crossref","first-page":"2542","DOI":"10.1038\/s41467-018-04964-5","article-title":"Clustering huge protein sequence sets in linear time","volume":"9","author":"Steinegger","year":"2018","journal-title":"Nat. Commun"},{"key":"2023013106114666900_btad029-B1952167","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/0022-2836(89)90084-3","article-title":"Protein structure alignment","volume":"208","author":"Taylor","year":"1989","journal-title":"J. Mol. Biol."},{"key":"2023013106114666900_btad029-B40","doi-asserted-by":"crossref","first-page":"D204","DOI":"10.1093\/nar\/gku989","article-title":"UniProt: A hub for protein information","volume":"43","author":"UniProt Consortium","year":"2015","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B41","doi-asserted-by":"crossref","first-page":"227","DOI":"10.1002\/prot.10146","article-title":"Scoring residue conservation","volume":"48","author":"Valdar","year":"2002","journal-title":"Proteins"},{"key":"2023013106114666900_btad029-B42","doi-asserted-by":"crossref","first-page":"D439","DOI":"10.1093\/nar\/gkab1061","article-title":"AlphaFold protein structure database: Massively expanding the structural coverage of protein-sequence space with high-accuracy models","volume":"50","author":"Varadi","year":"2022","journal-title":"Nucleic Acids Res"},{"key":"2023013106114666900_btad029-B43","author":"Wei\u00dfenow","year":"2021"},{"key":"2023013106114666900_btad029-B44","author":"Wu","year":"2022"},{"key":"2023013106114666900_btad029-B45","doi-asserted-by":"crossref","first-page":"D520","DOI":"10.1093\/nar\/gky949","article-title":"Protein Data bank: The single global archive for 3D macromolecular structure data","volume":"47","author":"wwPDB consortium","year":"2019","journal-title":"Nucleic Acids Res"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btad029\/48745028\/btad029.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/39\/1\/btad029\/48959111\/btad029.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/39\/1\/btad029\/48959111\/btad029.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,4]],"date-time":"2023-12-04T15:55:45Z","timestamp":1701705345000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btad029\/6989624"}},"subtitle":[],"editor":[{"given":"Alfonso","family":"Valencia","sequence":"additional","affiliation":[],"role":[{"role":"editor","vocabulary":"crossref"}]}],"short-title":[],"issued":{"date-parts":[[2023,1,1]]},"references-count":45,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,1,1]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btad029","relation":{"has-preprint":[{"id-type":"doi","id":"10.1101\/2022.03.10.483805","asserted-by":"object"}]},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"value":"1367-4803","type":"print"},{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2023,1,1]]},"published":{"date-parts":[[2023,1,1]]},"article-number":"btad029"}}