{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T21:47:24Z","timestamp":1776289644791,"version":"3.50.1"},"reference-count":40,"publisher":"Oxford University Press (OUP)","issue":"4","license":[{"start":{"date-parts":[[2024,3,20]],"date-time":"2024-03-20T00:00:00Z","timestamp":1710892800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100008530","name":"European Regional Development Fund","doi-asserted-by":"publisher","award":["13.1.1-LMT-K-718-05-0021"],"award-info":[{"award-number":["13.1.1-LMT-K-718-05-0021"]}],"id":[{"id":"10.13039\/501100008530","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004504","name":"Research Council of Lithuania","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004504","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,3,29]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:sec>\n                    <jats:title>Motivation<\/jats:title>\n                    <jats:p>Reliable prediction of protein thermostability from its sequence is valuable for both academic and industrial research. This prediction problem can be tackled using machine learning and by taking advantage of the recent blossoming of deep learning methods for sequence analysis. These methods can facilitate training on more data and, possibly, enable the development of more versatile thermostability predictors for multiple ranges of temperatures.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Results<\/jats:title>\n                    <jats:p>We applied the principle of transfer learning to predict protein thermostability using embeddings generated by protein language models (pLMs) from an input protein sequence. We used large pLMs that were pre-trained on hundreds of millions of known sequences. The embeddings from such models allowed us to efficiently train and validate a high-performing prediction method using over one million sequences that we collected from organisms with annotated growth temperatures. Our method, TemStaPro (Temperatures of Stability for Proteins), was used to predict thermostability of CRISPR-Cas Class II effector proteins (C2EPs). Predictions indicated sharp differences among groups of C2EPs in terms of thermostability and were largely in tune with previously published and our newly obtained experimental data.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Availability and implementation<\/jats:title>\n                    <jats:p>TemStaPro software and the related data are freely available from https:\/\/github.com\/ievapudz\/TemStaPro and https:\/\/doi.org\/10.5281\/zenodo.7743637.<\/jats:p>\n                  <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btae157","type":"journal-article","created":{"date-parts":[[2024,3,20]],"date-time":"2024-03-20T16:21:30Z","timestamp":1710951690000},"source":"Crossref","is-referenced-by-count":46,"title":["TemStaPro: protein thermostability prediction using sequence representations from protein language models"],"prefix":"10.1093","volume":"40","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0600-590X","authenticated-orcid":false,"given":"Ieva","family":"Pud\u017eiuvelyt\u0117","sequence":"first","affiliation":[{"name":"Institute of Biotechnology, Life Sciences Center, Vilnius University , LT-10257 Vilnius, Lithuania"},{"name":"Institute of Computer Science, Faculty of Mathematics and Informatics, Vilnius University , LT-08303 Vilnius, Lithuania"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4918-9505","authenticated-orcid":false,"given":"Kliment","family":"Olechnovi\u010d","sequence":"additional","affiliation":[{"name":"Institute of Biotechnology, Life Sciences Center, Vilnius University , LT-10257 Vilnius, Lithuania"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7804-5431","authenticated-orcid":false,"given":"Egle","family":"Godliauskaite","sequence":"additional","affiliation":[{"name":"CasZyme , LT-10257 Vilnius, Lithuania"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0842-1800","authenticated-orcid":false,"given":"Kristupas","family":"Sermokas","sequence":"additional","affiliation":[{"name":"CasZyme , LT-10257 Vilnius, Lithuania"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2847-4386","authenticated-orcid":false,"given":"Tomas","family":"Urbaitis","sequence":"additional","affiliation":[{"name":"CasZyme , LT-10257 Vilnius, Lithuania"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2509-9054","authenticated-orcid":false,"given":"Giedrius","family":"Gasiunas","sequence":"additional","affiliation":[{"name":"Institute of Biotechnology, Life Sciences Center, Vilnius University , LT-10257 Vilnius, Lithuania"},{"name":"CasZyme , LT-10257 Vilnius, Lithuania"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6135-8549","authenticated-orcid":false,"given":"Darius","family":"Kazlauskas","sequence":"additional","affiliation":[{"name":"Institute of Biotechnology, Life Sciences Center, Vilnius University , LT-10257 Vilnius, Lithuania"}]}],"member":"286","published-online":{"date-parts":[[2024,3,20]]},"reference":[{"key":"2024040900214019900_btae157-B1","doi-asserted-by":"crossref","first-page":"9586","DOI":"10.1038\/s41598-021-89029-2","article-title":"Efficient genome editing of an extreme thermophile, Thermus thermophilus, using a thermostable Cas9 variant","volume":"11","author":"Adalsteinsson","year":"2021","journal-title":"Sci Rep"},{"key":"2024040900214019900_btae157-B2","doi-asserted-by":"crossref","first-page":"790063","DOI":"10.3389\/fmicb.2022.790063","article-title":"iThermo: a sequence-based model for identifying thermophilic proteins using a multi-feature fusion strategy","volume":"13","author":"Ahmed","year":"2022","journal-title":"Front Microbiol"},{"key":"2024040900214019900_btae157-B3","doi-asserted-by":"crossref","first-page":"57","DOI":"10.1126\/science.abj6856","article-title":"The widespread IS200\/IS605 transposon family encodes diverse programmable RNA-guided endonucleases","volume":"374","author":"Altae-Tran","year":"2021","journal-title":"Science"},{"key":"2024040900214019900_btae157-B4","doi-asserted-by":"crossref","first-page":"D733","DOI":"10.1093\/nar\/gkac1037","article-title":"IMG\/VR v4: an expanded database of uncultivated virus genomes within a framework of extensive functional, taxonomic, and ecological metadata","volume":"51","author":"Camargo","year":"2023","journal-title":"Nucleic Acids Res"},{"key":"2024040900214019900_btae157-B5","doi-asserted-by":"crossref","first-page":"23782","DOI":"10.1038\/s41598-021-03293-w","article-title":"A novel sequence-based predictor for identifying and characterizing thermophilic proteins using estimated propensity scores of dipeptides","volume":"11","author":"Charoenkwan","year":"2021","journal-title":"Sci Rep"},{"key":"2024040900214019900_btae157-B6","doi-asserted-by":"crossref","first-page":"105704","DOI":"10.1016\/j.compbiomed.2022.105704","article-title":"SAPPHIRE: a stacking-based ensemble learning framework for accurate prediction of thermophilic proteins","volume":"146","author":"Charoenkwan","year":"2022","journal-title":"Comput Biol Med"},{"key":"2024040900214019900_btae157-B7","doi-asserted-by":"crossref","first-page":"275","DOI":"10.1093\/protein\/gzn001","article-title":"Revisiting the correlation between proteins\u2019 thermoresistance and organisms\u2019 thermophilicity","volume":"21","author":"Dehouck","year":"2008","journal-title":"Protein Eng Des Sel"},{"key":"2024040900214019900_btae157-B8","doi-asserted-by":"crossref","first-page":"e1002195","DOI":"10.1371\/journal.pcbi.1002195","article-title":"Accelerated profile HMM searches","volume":"7","author":"Eddy","year":"2011","journal-title":"PLoS Comput Biol"},{"key":"2024040900214019900_btae157-B9","doi-asserted-by":"crossref","first-page":"7112","DOI":"10.1109\/TPAMI.2021.3095381","article-title":"ProtTrans: toward understanding the language of life through self-supervised learning","volume":"44","author":"Elnaggar","year":"2022","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2024040900214019900_btae157-B10","doi-asserted-by":"crossref","first-page":"177","DOI":"10.1186\/s12866-018-1320-7","article-title":"Correlating enzyme annotations with a large set of microbial growth temperatures reveals metabolic adaptations to growth at diverse temperatures","volume":"18","author":"Engqvist","year":"2018","journal-title":"BMC Microbiol"},{"key":"2024040900214019900_btae157-B11","author":"Engqvist","year":"2018"},{"key":"2024040900214019900_btae157-B12","doi-asserted-by":"crossref","first-page":"138","DOI":"10.1016\/j.jtbi.2016.07.010","article-title":"Identification of thermophilic proteins by incorporating evolutionary and acid dissociation information into chou\u2019s general pseudo amino acid composition","volume":"407","author":"Fan","year":"2016","journal-title":"J Theor Biol"},{"key":"2024040900214019900_btae157-B13","doi-asserted-by":"crossref","first-page":"285","DOI":"10.3389\/fbioe.2020.00285","article-title":"A method for prediction of thermophilic protein based on reduced amino acids and mixed features","volume":"8","author":"Feng","year":"2020","journal-title":"Front Bioeng Biotechnol"},{"key":"2024040900214019900_btae157-B14","doi-asserted-by":"crossref","first-page":"bbac232","DOI":"10.1093\/bib\/bbac232","article-title":"Transfer learning in proteins: evaluating novel protein learned representations for bioinformatics tasks","volume":"23","author":"Fenoy","year":"2022","journal-title":"Brief Bioinform"},{"key":"2024040900214019900_btae157-B15","doi-asserted-by":"crossref","first-page":"3150","DOI":"10.1093\/bioinformatics\/bts565","article-title":"CD-HIT: accelerated for clustering the next-generation sequencing data","volume":"28","author":"Fu","year":"2012","journal-title":"Bioinformatics"},{"key":"2024040900214019900_btae157-B16","doi-asserted-by":"crossref","first-page":"5512","DOI":"10.1038\/s41467-020-19344-1","article-title":"A catalogue of biochemically diverse CRISPR-Cas9 orthologs","volume":"11","author":"Gasiunas","year":"2020","journal-title":"Nat Commun"},{"key":"2024040900214019900_btae157-B17","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1021\/acssynbio.2c00496","article-title":"CRISPR-Based diagnostics: challenges and potential solutions toward point-of-care applications","volume":"12","author":"Ghouneimy","year":"2023","journal-title":"ACS Synth Biol"},{"key":"2024040900214019900_btae157-B18","doi-asserted-by":"crossref","first-page":"1274","DOI":"10.1002\/prot.21616","article-title":"Discrimination of mesophilic and thermophilic proteins using machine learning algorithms","volume":"70","author":"Gromiha","year":"2008","journal-title":"Proteins"},{"key":"2024040900214019900_btae157-B19","doi-asserted-by":"crossref","first-page":"1424","DOI":"10.1038\/s41467-017-01408-4","article-title":"A thermostable Cas9 with increased lifetime in human plasma","volume":"8","author":"Harrington","year":"2017","journal-title":"Nat Commun"},{"key":"2024040900214019900_btae157-B20","doi-asserted-by":"crossref","first-page":"lqad087","DOI":"10.1093\/nargab\/lqad087","article-title":"Superior protein thermophilicity prediction with protein language model embeddings","volume":"5","author":"Haselbeck","year":"2023","journal-title":"NAR Genom Bioinform"},{"key":"2024040900214019900_btae157-B21","doi-asserted-by":"crossref","first-page":"692","DOI":"10.1038\/s41586-021-04058-1","article-title":"Transposon-associated TnpB is a programmable RNA-guided DNA endonuclease","volume":"599","author":"Karvelis","year":"2021","journal-title":"Nature"},{"key":"2024040900214019900_btae157-B22","doi-asserted-by":"crossref","first-page":"2665","DOI":"10.3390\/cells11172665","article-title":"Applications of CRISPR\/Cas13-Based RNA editing in plants","volume":"11","author":"Kavuri","year":"2022","journal-title":"Cells"},{"key":"2024040900214019900_btae157-B23","doi-asserted-by":"crossref","first-page":"196","DOI":"10.1007\/s12033-022-00538-5","article-title":"Current and prospective applications of CRISPR-Cas12a in pluricellular organisms","volume":"65","author":"Khan","year":"2023","journal-title":"Mol Biotechnol"},{"key":"2024040900214019900_btae157-B24","author":"Kingma","year":"2017"},{"key":"2024040900214019900_btae157-B25","doi-asserted-by":"crossref","first-page":"3236","DOI":"10.1093\/bioinformatics\/bth191","article-title":"UniProt archive","volume":"20","author":"Leinonen","year":"2004","journal-title":"Bioinformatics"},{"key":"2024040900214019900_btae157-B26","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1016\/j.mimet.2010.10.013","article-title":"Prediction of thermophilic proteins using feature selection technique","volume":"84","author":"Lin","year":"2011","journal-title":"J Microbiol Methods"},{"key":"2024040900214019900_btae157-B27","doi-asserted-by":"crossref","first-page":"1123","DOI":"10.1126\/science.ade2574","article-title":"Evolutionary-scale prediction of atomic-level protein structure with a language model","volume":"379","author":"Lin","year":"2023","journal-title":"Science"},{"key":"2024040900214019900_btae157-B28","doi-asserted-by":"crossref","first-page":"1647","DOI":"10.1038\/s41467-017-01591-4","article-title":"Characterizing a thermostable Cas9 for bacterial genome editing and silencing","volume":"8","author":"Mougiakos","year":"2017","journal-title":"Nat Commun"},{"key":"2024040900214019900_btae157-B30","doi-asserted-by":"crossref","first-page":"103926","DOI":"10.1016\/j.ebiom.2022.103926","article-title":"A thermostable Cas12b from brevibacillus leverages one-pot discrimination of SARS-CoV-2 variants of concern","volume":"77","author":"Nguyen","year":"2022","journal-title":"EBioMedicine"},{"key":"2024040900214019900_btae157-B31","doi-asserted-by":"crossref","first-page":"2858","DOI":"10.3390\/app13052858","article-title":"Identification of thermophilic proteins based on sequence-based bidirectional representations from transformer-embedding features","volume":"13","author":"Pei","year":"2023","journal-title":"Appl Sci"},{"key":"2024040900214019900_btae157-B32","doi-asserted-by":"crossref","first-page":"D753","DOI":"10.1093\/nar\/gkac1080","article-title":"MGnify: the microbiome sequence data analysis resource in 2023","volume":"51","author":"Richardson","year":"2023","journal-title":"Nucleic Acids Res"},{"key":"2024040900214019900_btae157-B33","doi-asserted-by":"crossref","first-page":"e2016239118","DOI":"10.1073\/pnas.2016239118","article-title":"Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences","volume":"118","author":"Rives","year":"2021","journal-title":"Proc Natl Acad Sci USA"},{"key":"2024040900214019900_btae157-B34","doi-asserted-by":"crossref","first-page":"384","DOI":"10.1038\/s41586-023-05826-x","article-title":"TnpB structure reveals minimal functional core of Cas12 nuclease family","volume":"616","author":"Sasnauskas","year":"2023","journal-title":"Nature"},{"key":"2024040900214019900_btae157-B35","doi-asserted-by":"crossref","first-page":"D20","DOI":"10.1093\/nar\/gkab1112","article-title":"Database resources of the National Center for Biotechnology Information","volume":"50","author":"Sayers","year":"2022","journal-title":"Nucleic Acids Res"},{"key":"2024040900214019900_btae157-B36","doi-asserted-by":"crossref","first-page":"vbab035","DOI":"10.1093\/bioadv\/vbab035","article-title":"Light attention predicts protein location from the language of life","volume":"1","author":"Stark","year":"2021","journal-title":"Bioinform Adv"},{"key":"2024040900214019900_btae157-B37","doi-asserted-by":"crossref","first-page":"1282","DOI":"10.1093\/bioinformatics\/btm098","article-title":"UniRef: comprehensive and non-redundant UniProt reference clusters","volume":"23","author":"Suzek","year":"2007","journal-title":"Bioinformatics"},{"key":"2024040900214019900_btae157-B38","doi-asserted-by":"crossref","first-page":"W228","DOI":"10.1093\/nar\/gkac278","article-title":"DeepLoc 2.0: multi-label subcellular localization prediction using protein language models","volume":"50","author":"Thumuluri","year":"2022","journal-title":"Nucleic Acids Res"},{"key":"2024040900214019900_btae157-B39","doi-asserted-by":"crossref","first-page":"e55481","DOI":"10.15252\/embr.202255481","article-title":"A new family of CRISPR-type V nucleases with C-rich PAM recognition","volume":"23","author":"Urbaitis","year":"2022","journal-title":"EMBO Rep"},{"key":"2024040900214019900_btae157-B40","doi-asserted-by":"crossref","first-page":"1814","DOI":"10.1016\/j.cell.2016.11.053","article-title":"PAM-Dependent target DNA recognition and cleavage by C2c1 CRISPR-Cas endonuclease","volume":"167","author":"Yang","year":"2016","journal-title":"Cell"},{"key":"2024040900214019900_btae157-B41","doi-asserted-by":"crossref","first-page":"2217","DOI":"10.3390\/ijms24032217","article-title":"DeepTP: a deep learning model for thermophilic protein prediction","volume":"24","author":"Zhao","year":"2023","journal-title":"Int J Mol Sci"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btae157\/57037971\/btae157.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btae157\/57185948\/btae157.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btae157\/57185948\/btae157.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,8]],"date-time":"2024-04-08T20:31:31Z","timestamp":1712608291000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btae157\/7632735"}},"subtitle":[],"editor":[{"given":"Lenore","family":"Cowen","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2024,3,20]]},"references-count":40,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,3,29]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btae157","relation":{"has-preprint":[{"id-type":"doi","id":"10.1101\/2023.03.27.534365","asserted-by":"object"}]},"ISSN":["1367-4811"],"issn-type":[{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2024,4,1]]},"published":{"date-parts":[[2024,3,20]]},"article-number":"btae157"}}