{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T03:53:21Z","timestamp":1776311601504,"version":"3.50.1"},"reference-count":117,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,3,21]],"date-time":"2022-03-21T00:00:00Z","timestamp":1647820800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,3,21]],"date-time":"2022-03-21T00:00:00Z","timestamp":1647820800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Nat Mach Intell"],"DOI":"10.1038\/s42256-022-00457-9","type":"journal-article","created":{"date-parts":[[2022,3,23]],"date-time":"2022-03-23T00:44:54Z","timestamp":1647996294000},"page":"227-245","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":186,"title":["Learning functional properties of proteins with language models"],"prefix":"10.1038","volume":"4","author":[{"given":"Serbulent","family":"Unsal","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3180-7010","authenticated-orcid":false,"given":"Heval","family":"Atas","sequence":"additional","affiliation":[]},{"given":"Muammer","family":"Albayrak","sequence":"additional","affiliation":[]},{"given":"Kemal","family":"Turhan","sequence":"additional","affiliation":[]},{"given":"Aybar C.","family":"Acar","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1298-9763","authenticated-orcid":false,"given":"Tunca","family":"Do\u011fan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,3,21]]},"reference":[{"key":"457_CR1","doi-asserted-by":"publisher","first-page":"334","DOI":"10.1186\/s12859-018-2368-y","volume":"19","author":"A Dalkiran","year":"2018","unstructured":"Dalkiran, A. et al. ECPred: a tool for the prediction of the enzymatic functions of protein sequences based on the EC nomenclature. BMC Bioinf. 19, 334 (2018).","journal-title":"BMC Bioinf."},{"key":"457_CR2","doi-asserted-by":"publisher","first-page":"771","DOI":"10.1016\/S0022-2836(03)00628-4","volume":"330","author":"PD Dobson","year":"2003","unstructured":"Dobson, P. D. & Doig, A. J. Distinguishing enzyme structures from non-enzymes without alignments. J. Mol. Biol. 330, 771\u2013783 (2003).","journal-title":"J. Mol. Biol."},{"key":"457_CR3","doi-asserted-by":"publisher","first-page":"1839","DOI":"10.1021\/ci900104b","volume":"49","author":"DARS Latino","year":"2009","unstructured":"Latino, D. A. R. S. & Aires-de-Sousa, J. Assignment of EC numbers to enzymatic reactions with MOLMAP reaction descriptors and random forests. J. Chem. Inf. Model. 49, 1839\u20131846 (2009).","journal-title":"J. Chem. Inf. Model."},{"key":"457_CR4","doi-asserted-by":"publisher","first-page":"e0141287","DOI":"10.1371\/journal.pone.0141287","volume":"10","author":"E Asgari","year":"2015","unstructured":"Asgari, E. & Mofrad, M. R. K. Continuous distributed representation of biological sequences for deep proteomics and genomics. PLoS ONE 10, e0141287 (2015).","journal-title":"PLoS ONE"},{"key":"457_CR5","unstructured":"Kimothi, D., Soni, A., Biyani, P. & Hogan, J. M. Distributed representations for biological sequence analysis. Preprint at https:\/\/arxiv.org\/abs\/1608.05949 (2016)."},{"key":"457_CR6","doi-asserted-by":"publisher","unstructured":"Nguyen, S., Li, Z. & Shang, Y. Deep networks and continuous distributed representation of protein sequences for protein quality assessment. In 2017 IEEE 29th International Conference on Tools with Artificial Intelligence (ICTAI) 527\u2013534 (IEEE, 2017); https:\/\/doi.org\/10.1109\/ICTAI.2017.00086","DOI":"10.1109\/ICTAI.2017.00086"},{"key":"457_CR7","doi-asserted-by":"publisher","first-page":"4884","DOI":"10.1021\/acs.chemrev.5b00683","volume":"116","author":"O Keskin","year":"2016","unstructured":"Keskin, O., Tuncbag, N. & Gursoy, A. Predicting protein\u2013protein interactions from the molecular to the proteome level. Chem. Rev. 116, 4884\u20134909 (2016).","journal-title":"Chem. Rev."},{"key":"457_CR8","doi-asserted-by":"publisher","first-page":"1878","DOI":"10.1093\/bib\/bby061","volume":"20","author":"AS Rifaioglu","year":"2019","unstructured":"Rifaioglu, A. S. et al. Recent applications of deep learning and machine intelligence on in silico drug discovery: methods, tools and databases. Briefings Bioinform. 20, 1878\u20131912 (2019).","journal-title":"Briefings Bioinform."},{"key":"457_CR9","doi-asserted-by":"publisher","first-page":"2531","DOI":"10.1039\/C9SC03414E","volume":"11","author":"AS Rifaioglu","year":"2020","unstructured":"Rifaioglu, A. S. et al. DEEPScreen: high performance drug-target interaction prediction with convolutional neural networks using 2-D structural compound representations. Chem. Sci. 11, 2531\u20132557 (2020).","journal-title":"Chem. Sci."},{"key":"457_CR10","doi-asserted-by":"publisher","first-page":"693","DOI":"10.1093\/bioinformatics\/btaa858","volume":"37","author":"AS Rifaioglu","year":"2021","unstructured":"Rifaioglu, A. S. et al. MDeePred: novel multi-channel protein featurization for deep learning-based binding affinity prediction in drug discovery. Bioinformatics 37, 693\u2013704 (2021).","journal-title":"Bioinformatics"},{"key":"457_CR11","doi-asserted-by":"publisher","first-page":"e1009171","DOI":"10.1371\/journal.pcbi.1009171","volume":"17","author":"T Do\u011fan","year":"2021","unstructured":"Do\u011fan, T. et al. Protein domain-based prediction of compound\u2013target interactions and experimental validation on LIM kinases. PLoS Comput. Biol. 17, e1009171 (2021).","journal-title":"PLoS Comput. Biol."},{"key":"457_CR12","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1002\/prot.25415","volume":"86","author":"J Moult","year":"2018","unstructured":"Moult, J., Fidelis, K., Kryshtafovych, A., Schwede, T. & Tramontano, A. Critical assessment of methods of protein structure prediction (CASP)-Round XII. Proteins 86, 7\u201315 (2018).","journal-title":"Proteins"},{"key":"457_CR13","doi-asserted-by":"crossref","unstructured":"Tunyasuvunakool, K. et al. Highly accurate protein structure prediction for the human proteome. Nature 596, 590\u2013596 (2021).","DOI":"10.1038\/s41586-021-03828-1"},{"key":"457_CR14","doi-asserted-by":"crossref","unstructured":"Baek, M. et al. Accurate prediction of protein structures and interactions using a three-track neural network. Science 373, 871\u2013876 (2021).","DOI":"10.1126\/science.abj8754"},{"key":"457_CR15","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-019-43708-3","volume":"9","author":"AS Rifaioglu","year":"2019","unstructured":"Rifaioglu, A. S., Do\u011fan, T., Jesus Martin, M., Cetin-Atalay, R. & Atalay, V. DEEPred: automated protein function prediction with multi-task feed-forward deep neural networks. Sci. Rep. 9, 7344 (2019).","journal-title":"Sci. Rep."},{"key":"457_CR16","doi-asserted-by":"publisher","first-page":"2465","DOI":"10.1093\/bioinformatics\/bty130","volume":"34","author":"R You","year":"2018","unstructured":"You, R. et al. GOLabeler: improving sequence-based large-scale protein function prediction by learning to rank. Bioinformatics 34, 2465\u20132473 (2018).","journal-title":"Bioinformatics"},{"key":"457_CR17","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1093\/bioinformatics\/bty704","volume":"35","author":"A Jain","year":"2019","unstructured":"Jain, A. & Kihara, D. Phylo-PFP: improved automated protein function prediction using phylogenetic distance of distantly related sequences. Bioinformatics 35, 753\u2013759 (2019).","journal-title":"Bioinformatics"},{"key":"457_CR18","doi-asserted-by":"crossref","unstructured":"The Gene Ontology Consortium. The gene ontology resource: 20 years and still GOing strong. Nucleic Acids Res. 47, D330\u2013D338 (2019).","DOI":"10.1093\/nar\/gky1055"},{"key":"457_CR19","doi-asserted-by":"publisher","DOI":"10.1186\/s13059-019-1835-8","volume":"20","author":"N Zhou","year":"2019","unstructured":"Zhou, N. et al. The CAFA challenge reports improved protein function prediction and new functional annotations for hundreds of genes through experimental screens. Genome Biol. 20, 244 (2019).","journal-title":"Genome Biol."},{"key":"457_CR20","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun, Y., Bengio, Y. & Hinton, G. Deep learning. Nature 521, 436\u2013444 (2015).","journal-title":"Nature"},{"key":"457_CR21","doi-asserted-by":"publisher","first-page":"24","DOI":"10.1038\/s41591-018-0316-z","volume":"25","author":"A Esteva","year":"2019","unstructured":"Esteva, A. et al. A guide to deep learning in healthcare. Nat. Med. 25, 24\u201329 (2019).","journal-title":"Nat. Med."},{"key":"457_CR22","doi-asserted-by":"crossref","unstructured":"Liu, L. et al. Deep learning for generic object detection: a survey. Int. J. Comput. Vision 128, 261\u2013318 (2020).","DOI":"10.1007\/s11263-019-01247-4"},{"key":"457_CR23","doi-asserted-by":"publisher","first-page":"2224","DOI":"10.1109\/COMST.2019.2904897","volume":"21","author":"C Zhang","year":"2019","unstructured":"Zhang, C., Patras, P. & Haddadi, H. Deep learning in mobile and wireless networking: a survey. IEEE Commun. Surv. Tutor. 21, 2224\u20132287 (2019).","journal-title":"IEEE Commun. Surv. Tutor."},{"key":"457_CR24","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1038\/s41588-018-0295-5","volume":"51","author":"J Zou","year":"2019","unstructured":"Zou, J. et al. A primer on deep learning in genomics. Nat. Genet. 51, 12\u201318 (2019).","journal-title":"Nat. Genet."},{"key":"457_CR25","doi-asserted-by":"publisher","first-page":"1817","DOI":"10.1186\/s40537-016-0043-6","volume":"3","author":"K Weiss","year":"2016","unstructured":"Weiss, K., Khoshgoftaar, T. M. & Wang, D. A survey of transfer learning. J. Big Data 3, 1817 (2016).","journal-title":"J. Big Data"},{"key":"457_CR26","unstructured":"Raffel, C. et al. Exploring the limits of transfer learning with a unified text-to-text transformer. Preprint at https:\/\/arxiv.org\/abs\/1910.10683 (2019)."},{"key":"457_CR27","first-page":"9689","volume":"32","author":"R Rao","year":"2019","unstructured":"Rao, R. et al. Evaluating protein transfer learning with TAPE. Adv. Neural Inf. Process. Syst. 32, 9689\u20139701 (2019).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"457_CR28","doi-asserted-by":"crossref","unstructured":"Meier, J. et al. Language models enable zero-shot prediction of the effects of mutations on protein function. In Advances in Neural Information Processing Systems Vol. 34 (NeurIPS, 2021).","DOI":"10.1101\/2021.07.09.450648"},{"key":"457_CR29","doi-asserted-by":"crossref","unstructured":"Elnaggar, A. et al. ProtTrans: towards cracking the language of life\u2019s code through self-supervised deep learning and high performance computing. Preprint at https:\/\/arxiv.org\/abs\/2007.06225 (2020).","DOI":"10.1101\/2020.07.12.199554"},{"key":"457_CR30","doi-asserted-by":"publisher","first-page":"2642","DOI":"10.1093\/bioinformatics\/bty178","volume":"34","author":"KK Yang","year":"2018","unstructured":"Yang, K. K., Wu, Z., Bedbrook, C. N. & Arnold, F. H. Learned protein embeddings for machine learning. Bioinformatics 34, 2642\u20132648 (2018).","journal-title":"Bioinformatics"},{"key":"457_CR31","first-page":"540","volume":"360","author":"M Heinzinger","year":"2019","unstructured":"Heinzinger, M. et al. Modeling the language of life-deep learning protein sequences. Bioinformatics 360, 540 (2019).","journal-title":"Bioinformatics"},{"key":"457_CR32","doi-asserted-by":"publisher","DOI":"10.1186\/s12920-018-0349-7","volume":"11","author":"S Kim","year":"2018","unstructured":"Kim, S., Lee, H., Kim, K. & Kang, J. Mut2Vec: distributed representation of cancerous mutations. BMC Med. Genomics 11, 33 (2018).","journal-title":"BMC Med. Genomics"},{"key":"457_CR33","doi-asserted-by":"publisher","DOI":"10.1186\/s12864-018-5370-x","volume":"20","author":"J Du","year":"2019","unstructured":"Du, J. et al. Gene2vec: distributed representation of genes based on co-expression. BMC Genomics 20, 82 (2019).","journal-title":"BMC Genomics"},{"key":"457_CR34","doi-asserted-by":"crossref","unstructured":"Choy, C. T., Wong, C. H. & Chan, S. L. Infer related genes from large scale gene expression dataset with embedding. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/362848v2 (2018).","DOI":"10.1101\/362848"},{"key":"457_CR35","unstructured":"Rao, R. et al. MSA transformer. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/2021.02.12.430858v3 (2021)."},{"key":"457_CR36","doi-asserted-by":"crossref","unstructured":"Lu, A. X., Zhang, H., Ghassemi, M. & Moses, A. Self-supervised contrastive learning of protein representations by mutual information maximization. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/2020.09.04.283929v2 (2020).","DOI":"10.1101\/2020.09.04.283929"},{"key":"457_CR37","doi-asserted-by":"publisher","first-page":"1315","DOI":"10.1038\/s41592-019-0598-1","volume":"16","author":"EC Alley","year":"2019","unstructured":"Alley, E. C., Khimulya, G., Biswas, S., AlQuraishi, M. & Church, G. M. Unified rational protein engineering with sequence-based deep representation learning. Nat. Methods 16, 1315\u20131322 (2019).","journal-title":"Nat. Methods"},{"key":"457_CR38","doi-asserted-by":"publisher","first-page":"e2016239118","DOI":"10.1073\/pnas.2016239118","volume":"118","author":"A Rives","year":"2021","unstructured":"Rives, A. et al. Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences. Proc. Natl Acad. Sci. USA 118, e2016239118 (2021).","journal-title":"Proc. Natl Acad. Sci. USA"},{"key":"457_CR39","doi-asserted-by":"publisher","first-page":"366","DOI":"10.1038\/s41592-021-01101-x","volume":"18","author":"B Buchfink","year":"2021","unstructured":"Buchfink, B., Reuter, K. & Drost, H.-G. Sensitive protein alignments at tree-of-life scale using DIAMOND. Nat. Methods 18, 366\u2013368 (2021).","journal-title":"Nat. Methods"},{"key":"457_CR40","doi-asserted-by":"publisher","first-page":"431","DOI":"10.1186\/1471-2105-11-431","volume":"11","author":"LS Johnson","year":"2010","unstructured":"Johnson, L. S., Eddy, S. R. & Portugaly, E. Hidden Markov model speed heuristic and iterative HMM search procedure. BMC Bioinf. 11, 431 (2010).","journal-title":"BMC Bioinf."},{"key":"457_CR41","doi-asserted-by":"publisher","first-page":"D412","DOI":"10.1093\/nar\/gkaa913","volume":"49","author":"J Mistry","year":"2021","unstructured":"Mistry, J. et al. Pfam: the protein families database in 2021. Nucleic Acids Res. 49, D412\u2013D419 (2021).","journal-title":"Nucleic Acids Res."},{"key":"457_CR42","doi-asserted-by":"publisher","unstructured":"Gromiha, M. M. Protein Sequence Analysis. In Protein Bioinformatics (ed. Gromiha, M. M.) Ch. 2, 29\u201362 (Academic, 2010); https:\/\/doi.org\/10.1016\/B978-8-1312-2297-3.50002-3","DOI":"10.1016\/B978-8-1312-2297-3.50002-3"},{"key":"457_CR43","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1093\/bioinformatics\/bth466","volume":"21","author":"K-C Chou","year":"2005","unstructured":"Chou, K.-C. Using amphiphilic pseudo amino acid composition to predict enzyme subfamily classes. Bioinformatics 21, 10\u201319 (2005).","journal-title":"Bioinformatics"},{"key":"457_CR44","doi-asserted-by":"publisher","first-page":"2756","DOI":"10.1093\/bioinformatics\/btx302","volume":"33","author":"J Wang","year":"2017","unstructured":"Wang, J. et al. POSSUM: a bioinformatics toolkit for generating numerical sequence feature descriptors based on PSSM profiles. Bioinformatics 33, 2756\u20132758 (2017).","journal-title":"Bioinformatics"},{"key":"457_CR45","doi-asserted-by":"publisher","first-page":"D213","DOI":"10.1093\/nar\/gku1243","volume":"43","author":"A Mitchell","year":"2015","unstructured":"Mitchell, A. et al. The InterPro protein families database: the classification resource after 15 years. Nucleic Acids Res. 43, D213\u2013D221 (2015).","journal-title":"Nucleic Acids Res."},{"key":"457_CR46","doi-asserted-by":"publisher","first-page":"D204","DOI":"10.1093\/nar\/gku989","volume":"43","author":"UniProt Consortium.","year":"2015","unstructured":"UniProt Consortium. UniProt: a hub for protein information. Nucleic Acids Res. 43, D204\u2013D212 (2015).","journal-title":"Nucleic Acids Res."},{"key":"457_CR47","doi-asserted-by":"publisher","first-page":"D884","DOI":"10.1093\/nar\/gkaa942","volume":"49","author":"KL Howe","year":"2021","unstructured":"Howe, K. L. et al. Ensembl 2021. Nucleic Acids Res. 49, D884\u2013D891 (2021).","journal-title":"Nucleic Acids Res."},{"key":"457_CR48","doi-asserted-by":"publisher","first-page":"e0220182","DOI":"10.1371\/journal.pone.0220182","volume":"14","author":"C Mirabello","year":"2019","unstructured":"Mirabello, C. & Wallner, B. rawMSA: end-to-end deep learning using raw multiple sequence alignments. PLoS ONE 14, e0220182 (2019).","journal-title":"PLoS ONE"},{"key":"457_CR49","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-018-26392-7","volume":"8","author":"Y Xu","year":"2018","unstructured":"Xu, Y., Song, J., Wilson, C. & Whisstock, J. C. PhosContext2vec: a distributed representation of residue-level sequence contexts and its application to general and kinase-specific phosphorylation site prediction. Sci. Rep. 8, 8240 (2018).","journal-title":"Sci. Rep."},{"key":"457_CR50","unstructured":"Lin, D. & Others. An information-theoretic definition of similarity. In ICML '98: Proc. 15th International Conference on Machine Learning 296\u2013304 (ACM, 1998)."},{"key":"457_CR51","unstructured":"Pedregosa, F., Varoquaux, G. & Gramfort, A. Scikit-learn: machine learning in Python. J. Mach. Learn. Res. 12, 2825\u20132830 (2011)."},{"key":"457_CR52","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-020-80786-0","volume":"11","author":"M Littmann","year":"2021","unstructured":"Littmann, M., Heinzinger, M., Dallago, C., Olenyi, T. & Rost, B. Embeddings from deep learning transfer GO annotations beyond homology. Sci. Rep. 11, 1160 (2021).","journal-title":"Sci. Rep."},{"key":"457_CR53","doi-asserted-by":"publisher","first-page":"162","DOI":"10.1093\/bioinformatics\/btaa701","volume":"37","author":"A Villegas-Morcillo","year":"2021","unstructured":"Villegas-Morcillo, A. et al. Unsupervised protein embeddings outperform hand-crafted sequence and structure features at predicting molecular function. Bioinformatics 37, 162\u2013170 (2021).","journal-title":"Bioinformatics"},{"key":"457_CR54","doi-asserted-by":"publisher","first-page":"D930","DOI":"10.1093\/nar\/gky1075","volume":"47","author":"D Mendez","year":"2019","unstructured":"Mendez, D. et al. ChEMBL: towards direct deposition of bioassay data. Nucleic Acids Res. 47, D930\u2013D940 (2019).","journal-title":"Nucleic Acids Res."},{"key":"457_CR55","unstructured":"Vaswani, A. et al. Attention is all you need. in Advances in Neural Information Processing Systems 30 (eds. Guyon, I. et al.) 5998\u20136008 (Curran Associates, 2017)."},{"key":"457_CR56","doi-asserted-by":"crossref","unstructured":"Vig, J. et al. BERTology meets biology: interpreting attention in protein language models. Preprint at https:\/\/arxiv.org\/abs\/2006.15222 (2020).","DOI":"10.1101\/2020.06.26.174417"},{"key":"457_CR57","doi-asserted-by":"publisher","first-page":"706","DOI":"10.1038\/s41586-019-1923-7","volume":"577","author":"AW Senior","year":"2020","unstructured":"Senior, A. W. et al. Improved protein structure prediction using potentials from deep learning. Nature 577, 706\u2013710 (2020).","journal-title":"Nature"},{"key":"457_CR58","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2382577.2382579","volume":"6","author":"S Kaufman","year":"2012","unstructured":"Kaufman, S., Rosset, S., Perlich, C. & Stitelman, O. Leakage in data mining: formulation, detection, and avoidance. ACM Trans. Knowl. Discov. Data 6, 1\u201321 (2012).","journal-title":"ACM Trans. Knowl. Discov. Data"},{"key":"457_CR59","doi-asserted-by":"publisher","first-page":"1116","DOI":"10.3389\/fpsyg.2016.01116","volume":"7","author":"M Brysbaert","year":"2016","unstructured":"Brysbaert, M., Stevens, M., Mandera, P. & Keuleers, E. How many words do we know? Practical estimates of vocabulary size dependent on word definition, the degree of language input and the participant\u2019s age. Front. Psychol. 7, 1116 (2016).","journal-title":"Front. Psychol."},{"key":"457_CR60","unstructured":"Higgins, I. et al. Towards a definition of disentangled representations. Preprint at https:\/\/arxiv.org\/abs\/1812.02230 (2018)."},{"key":"457_CR61","doi-asserted-by":"publisher","first-page":"e39397","DOI":"10.7554\/eLife.39397","volume":"8","author":"J Tubiana","year":"2019","unstructured":"Tubiana, J., Cocco, S. & Monasson, R. Learning protein constitutive motifs from sequence data. eLife 8, e39397 (2019).","journal-title":"eLife"},{"key":"457_CR62","unstructured":"\u00d6zt\u00fcrk, H., Ozkirimli, E. & \u00d6zg\u00fcr, A. WideDTA: prediction of drug-target binding affinity. Preprint at https:\/\/arxiv.org\/abs\/1902.04166 (2019)."},{"key":"457_CR63","doi-asserted-by":"publisher","first-page":"816","DOI":"10.1038\/s41592-018-0138-4","volume":"15","author":"AJ Riesselman","year":"2018","unstructured":"Riesselman, A. J., Ingraham, J. B. & Marks, D. S. Deep generative models of genetic variation capture the effects of mutations. Nat. Methods 15, 816\u2013822 (2018).","journal-title":"Nat. Methods"},{"key":"457_CR64","doi-asserted-by":"publisher","first-page":"e96","DOI":"10.1093\/nar\/gkab543","volume":"49","author":"T Do\u011fan","year":"2021","unstructured":"Do\u011fan, T. et al. CROssBAR: Comprehensive resource of biomedical relations with knowledge graph representations. Nucleic Acids Res. 49, e96\u2013e96 (2021).","journal-title":"Nucleic Acids Res."},{"key":"457_CR65","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1016\/j.tibtech.2015.10.007","volume":"34","author":"MJ Burk","year":"2016","unstructured":"Burk, M. J. & Van Dien, S. Biotechnology for chemical production: challenges and opportunities. Trends Biotechnol. 34, 187\u2013190 (2016).","journal-title":"Trends Biotechnol."},{"key":"457_CR66","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1016\/j.sbi.2016.03.006","volume":"39","author":"P Gainza","year":"2016","unstructured":"Gainza, P., Nisonoff, H. M. & Donald, B. R. Algorithms for protein design. Curr. Opin. Struct. Biol. 39, 16\u201326 (2016).","journal-title":"Curr. Opin. Struct. Biol."},{"key":"457_CR67","doi-asserted-by":"publisher","first-page":"1817","DOI":"10.1002\/pro.481","volume":"19","author":"D Baker","year":"2010","unstructured":"Baker, D. An exciting but challenging road ahead for computational enzyme design. Protein Sci. 19, 1817\u20131819 (2010).","journal-title":"Protein Sci."},{"key":"457_CR68","doi-asserted-by":"publisher","first-page":"190","DOI":"10.1038\/nature06879","volume":"453","author":"D R\u00f6thlisberger","year":"2008","unstructured":"R\u00f6thlisberger, D. et al. Kemp elimination catalysts by computational enzyme design. Nature 453, 190\u2013195 (2008).","journal-title":"Nature"},{"key":"457_CR69","doi-asserted-by":"publisher","first-page":"3790","DOI":"10.1073\/pnas.1118082108","volume":"109","author":"HK Privett","year":"2012","unstructured":"Privett, H. K. et al. Iterative approach to computational enzyme design. Proc. Natl Acad. Sci. USA 109, 3790\u20133795 (2012).","journal-title":"Proc. Natl Acad. Sci. USA"},{"key":"457_CR70","doi-asserted-by":"publisher","first-page":"350","DOI":"10.1016\/S0076-6879(04)80016-8","volume":"380","author":"HS Chan","year":"2004","unstructured":"Chan, H. S., Shimizu, S. & Kaya, H. Cooperativity principles in protein folding. Methods Enzymol. 380, 350\u2013379 (2004).","journal-title":"Methods Enzymol."},{"key":"457_CR71","doi-asserted-by":"publisher","first-page":"1171","DOI":"10.1038\/nbt1336","volume":"25","author":"SM Lippow","year":"2007","unstructured":"Lippow, S. M., Wittrup, K. D. & Tidor, B. Computational design of antibody-affinity improvement beyond in vivo maturation. Nat. Biotechnol. 25, 1171\u20131176 (2007).","journal-title":"Nat. Biotechnol."},{"key":"457_CR72","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1038\/nature01556","volume":"423","author":"LL Looger","year":"2003","unstructured":"Looger, L. L., Dwyer, M. A., Smith, J. J. & Hellinga, H. W. Computational design of receptor and sensor proteins with novel functions. Nature 423, 185\u2013190 (2003).","journal-title":"Nature"},{"key":"457_CR73","doi-asserted-by":"publisher","first-page":"1999","DOI":"10.1002\/jcc.10349","volume":"24","author":"Y Duan","year":"2003","unstructured":"Duan, Y. et al. A point-charge force field for molecular mechanics simulations of proteins based on condensed-phase quantum mechanical calculations. J. Comput. Chem. 24, 1999\u20132012 (2003).","journal-title":"J. Comput. Chem."},{"key":"457_CR74","doi-asserted-by":"publisher","first-page":"6217","DOI":"10.1021\/cr500628b","volume":"115","author":"E Brunk","year":"2015","unstructured":"Brunk, E. & Rothlisberger, U. Mixed quantum mechanical\/molecular mechanical molecular dynamics simulations of biological systems in ground and electronically excited states. Chem. Rev. 115, 6217\u20136263 (2015).","journal-title":"Chem. Rev."},{"key":"457_CR75","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1039\/C6ME00083E","volume":"2","author":"MC Childers","year":"2017","unstructured":"Childers, M. C. & Daggett, V. Insights from molecular dynamics simulations for computational protein design. Mol. Syst. Des. Eng. 2, 9\u201333 (2017).","journal-title":"Mol. Syst. Des. Eng."},{"key":"457_CR76","doi-asserted-by":"publisher","first-page":"1129","DOI":"10.1016\/j.neuron.2018.08.011","volume":"99","author":"SA Hollingsworth","year":"2018","unstructured":"Hollingsworth, S. A. & Dror, R. O. Molecular dynamics simulation for all. Neuron 99, 1129\u20131143 (2018).","journal-title":"Neuron"},{"key":"457_CR77","doi-asserted-by":"publisher","first-page":"8982","DOI":"10.1021\/ja5027584","volume":"136","author":"C Camilloni","year":"2014","unstructured":"Camilloni, C. & Vendruscolo, M. Statistical mechanics of the denatured state of a protein using replica-averaged metadynamics. J. Am. Chem. Soc. 136, 8982\u20138991 (2014).","journal-title":"J. Am. Chem. Soc."},{"key":"457_CR78","doi-asserted-by":"publisher","first-page":"2648","DOI":"10.1002\/prot.23086","volume":"79","author":"S-Y Huang","year":"2011","unstructured":"Huang, S.-Y. & Zou, X. Statistical mechanics-based method to extract atomic distance-dependent potentials from protein structures. Proteins 79, 2648\u20132661 (2011).","journal-title":"Proteins"},{"key":"457_CR79","doi-asserted-by":"publisher","first-page":"779","DOI":"10.1093\/protein\/15.10.779","volume":"15","author":"NA Pierce","year":"2002","unstructured":"Pierce, N. A. & Winfree, E. Protein design is NP-hard. Protein Eng. 15, 779\u2013782 (2002).","journal-title":"Protein Eng."},{"key":"457_CR80","doi-asserted-by":"crossref","unstructured":"Eguchi, R. R., Anand, N., Choe, C. A. & Huang, P.-S. IG-VAE: Generative modeling of immunoglobulin proteins by direct 3D coordinate generation. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/2020.08.07.242347v2 (2020).","DOI":"10.1101\/2020.08.07.242347"},{"key":"457_CR81","unstructured":"Ng, A. Y. & Jordan, M. I. On discriminative vs. generative classifiers: a comparison of logistic regression and naive Bayes. In Advances in Neural Information Processing Systems (eds. Dietterich, T. G., Becker, S. & Ghahramani, Z.) Vol. 14, 841\u2013848 (MIT Press, 2002)."},{"key":"457_CR82","doi-asserted-by":"publisher","first-page":"361","DOI":"10.1146\/annurev-statistics-010814-020120","volume":"2","author":"R Salakhutdinov","year":"2015","unstructured":"Salakhutdinov, R. Learning deep generative models. Annu. Rev. Stat. Appl. 2, 361\u2013385 (2015).","journal-title":"Annu. Rev. Stat. Appl."},{"key":"457_CR83","doi-asserted-by":"crossref","unstructured":"Madani, A. et al. Deep neural language modeling enables functional protein generation across families. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/2021.07.18.452833v1 (2021).","DOI":"10.1101\/2021.07.18.452833"},{"key":"457_CR84","doi-asserted-by":"crossref","unstructured":"St\u00e4rk, H., Dallago, C., Heinzinger, M. & Rost, B. Light attention predicts protein location from the language of life. Bioinformatics Advances 1, vbab035 (2021).","DOI":"10.1093\/bioadv\/vbab035"},{"key":"457_CR85","doi-asserted-by":"publisher","first-page":"976","DOI":"10.1093\/bioinformatics\/btq064","volume":"26","author":"G Yu","year":"2010","unstructured":"Yu, G. et al. GOSemSim: an R package for measuring semantic similarity among GO terms and gene products. Bioinformatics 26, 976\u2013978 (2010).","journal-title":"Bioinformatics"},{"key":"457_CR86","doi-asserted-by":"publisher","first-page":"1116","DOI":"10.1016\/j.jbi.2013.08.008","volume":"46","author":"BT McInnes","year":"2013","unstructured":"McInnes, B. T. & Pedersen, T. Evaluating measures of semantic similarity and relatedness to disambiguate terms in biomedical text. J. Biomed. Inform. 46, 1116\u20131124 (2013).","journal-title":"J. Biomed. Inform."},{"key":"457_CR87","doi-asserted-by":"publisher","first-page":"72","DOI":"10.2307\/1412159","volume":"15","author":"C Spearman","year":"1904","unstructured":"Spearman, C. The proof and measurement of association between two things. Am. J. Psychol. 15, 72\u2013101 (1904).","journal-title":"Am. J. Psychol."},{"key":"457_CR88","doi-asserted-by":"publisher","first-page":"1282","DOI":"10.1093\/bioinformatics\/btm098","volume":"23","author":"BE Suzek","year":"2007","unstructured":"Suzek, B. E., Huang, H., McGarvey, P., Mazumder, R. & Wu, C. H. UniRef: comprehensive and non-redundant UniProt reference clusters. Bioinformatics 23, 1282\u20131288 (2007).","journal-title":"Bioinformatics"},{"key":"457_CR89","doi-asserted-by":"publisher","first-page":"650","DOI":"10.1016\/j.cell.2018.01.029","volume":"172","author":"SA Lambert","year":"2018","unstructured":"Lambert, S. A. et al. The human transcription factors. Cell 172, 650\u2013665 (2018).","journal-title":"Cell"},{"key":"457_CR90","doi-asserted-by":"publisher","first-page":"D170","DOI":"10.1093\/nar\/gkw1081","volume":"45","author":"M Mirdita","year":"2017","unstructured":"Mirdita, M. et al. Uniclust databases of clustered and deeply annotated protein sequences and alignments. Nucleic Acids Res. 45, D170\u2013D176 (2017).","journal-title":"Nucleic Acids Res."},{"key":"457_CR91","doi-asserted-by":"publisher","first-page":"2600","DOI":"10.1093\/bioinformatics\/bts489","volume":"28","author":"IH Moal","year":"2012","unstructured":"Moal, I. H. & Fern\u00e1ndez-Recio, J. SKEMPI: a structural kinetic and energetic database of mutant protein interactions and its use in empirical models. Bioinformatics 28, 2600\u20132607 (2012).","journal-title":"Bioinformatics"},{"key":"457_CR92","doi-asserted-by":"publisher","first-page":"i305","DOI":"10.1093\/bioinformatics\/btz328","volume":"35","author":"M Chen","year":"2019","unstructured":"Chen, M. et al. Multifaceted protein\u2013protein interaction prediction based on Siamese residual RCNN. Bioinformatics 35, i305\u2013i314 (2019).","journal-title":"Bioinformatics"},{"key":"457_CR93","unstructured":"Tipping, M. E. Sparse Bayesian learning and the relevance vector machine. J. Mach. Learn. Res. 1, 211\u2013244 (2001)."},{"key":"457_CR94","doi-asserted-by":"crossref","unstructured":"Wan, F. & Zeng, J. (M.). Deep learning with feature embedding for compound\u2013protein interaction prediction. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/086033v1 (2016).","DOI":"10.1101\/086033"},{"key":"457_CR95","doi-asserted-by":"publisher","first-page":"3577","DOI":"10.1038\/s41598-019-38746-w","volume":"9","author":"E Asgari","year":"2019","unstructured":"Asgari, E., McHardy, A. C. & Mofrad, M. R. Probabilistic variable-length segmentation of protein sequences for discriminative motif discovery (DiMotif) and sequence embedding (ProtVecX). Sci. Rep. 9, 3577 (2019).","journal-title":"Sci. Rep."},{"key":"457_CR96","doi-asserted-by":"publisher","first-page":"i821","DOI":"10.1093\/bioinformatics\/bty593","volume":"34","author":"H \u00d6zt\u00fcrk","year":"2018","unstructured":"\u00d6zt\u00fcrk, H., \u00d6zg\u00fcr, A. & Ozkirimli, E. DeepDTA: deep drug-target binding affinity prediction. Bioinformatics 34, i821\u2013i829 (2018).","journal-title":"Bioinformatics"},{"key":"457_CR97","doi-asserted-by":"publisher","first-page":"58826","DOI":"10.1109\/ACCESS.2018.2874208","volume":"6","author":"M Oubounyt","year":"2018","unstructured":"Oubounyt, M., Louadi, Z., Tayara, H. & To Chong, K. Deep learning models based on distributed feature representations for alternative splicing prediction. IEEE Access 6, 58826\u201358834 (2018).","journal-title":"IEEE Access"},{"key":"457_CR98","doi-asserted-by":"crossref","unstructured":"Mirabello, C. & Wallner, B. rawMSA: End-to-end deep learning makes protein sequence profiles and feature extraction obsolete. Bioinformatics 228 (2018).","DOI":"10.1101\/394437"},{"key":"457_CR99","doi-asserted-by":"publisher","first-page":"434","DOI":"10.1016\/j.compbiolchem.2018.03.009","volume":"74","author":"A Dutta","year":"2018","unstructured":"Dutta, A., Dubey, T., Singh, K. K. & Anand, A. SpliceVec: distributed feature representations for splice junction prediction. Comput. Biol. Chem. 74, 434\u2013441 (2018).","journal-title":"Comput. Biol. Chem."},{"key":"457_CR100","doi-asserted-by":"publisher","DOI":"10.1186\/s12870-019-1693-2","volume":"19","author":"MK Mej\u00eda-Guerra","year":"2019","unstructured":"Mej\u00eda-Guerra, M. K. & Buckler, E. S. A k-mer grammar analysis to uncover maize regulatory architecture. BMC Plant Biol. 19, 103 (2019).","journal-title":"BMC Plant Biol."},{"key":"457_CR101","doi-asserted-by":"crossref","unstructured":"Cohen, T., Widdows, D., Heiden, J. A. V., Gupta, N. T. & Kleinstein, S. H. Graded vector representations of immunoglobulins produced in response to west Nile virus. In Quantum Interaction (eds de Barros, J. A., Coecke, B. & Pothos, E.) 135\u2013148 (Springer, 2017).","DOI":"10.1007\/978-3-319-52289-0_11"},{"key":"457_CR102","unstructured":"Ng, P. dna2vec: Consistent vector representations of variable-length k-mers. Preprint at https:\/\/arxiv.org\/abs\/1701.06279 (2017)."},{"key":"457_CR103","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1021\/acs.jcim.7b00616","volume":"58","author":"S Jaeger","year":"2018","unstructured":"Jaeger, S., Fulle, S. & Turk, S. Mol2vec: Unsupervised machine learning approach with chemical intuition. J. Chem. Inf. Model. 58, 27\u201335 (2018).","journal-title":"J. Chem. Inf. Model."},{"key":"457_CR104","doi-asserted-by":"crossref","unstructured":"Viehweger, A., Krautwurst, S., Parks, D. H., K\u00f6nig, B. & Marz, M. An encoding of genome content for machine learning. Preprint at https:\/\/www.biorxiv.org\/content\/10.1101\/524280v3 (2019).","DOI":"10.1101\/524280"},{"key":"457_CR105","doi-asserted-by":"publisher","first-page":"e32235","DOI":"10.1371\/journal.pone.0032235","volume":"7","author":"Y Qi","year":"2012","unstructured":"Qi, Y., Oja, M., Weston, J. & Noble, W. S. A unified multitask architecture for predicting local protein properties. PLoS ONE 7, e32235 (2012).","journal-title":"PLoS ONE"},{"key":"457_CR106","doi-asserted-by":"publisher","first-page":"e1001047","DOI":"10.1371\/journal.pcbi.1001047","volume":"7","author":"I Melvin","year":"2011","unstructured":"Melvin, I., Weston, J., Noble, W. S. & Leslie, C. Detecting remote evolutionary relationships among proteins by large-scale semantic embedding. PLoS Comput. Biol. 7, e1001047 (2011).","journal-title":"PLoS Comput. Biol."},{"key":"457_CR107","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-018-32180-0","volume":"8","author":"J Choi","year":"2018","unstructured":"Choi, J., Oh, I., Seo, S. & Ahn, J. G2Vec: distributed gene representations for identification of cancer prognostic genes. Sci. Rep. 8, 13729 (2018).","journal-title":"Sci. Rep."},{"key":"457_CR108","doi-asserted-by":"publisher","unstructured":"You, R. & Zhu, S. DeepText2Go: Improving large-scale protein function prediction with deep semantic text representation. In 2017 IEEE International Conference on Bioinformatics and Biomedicine (BIBM) 42\u201349 (IEEE, 2017); https:\/\/doi.org\/10.1109\/BIBM.2017.8217622","DOI":"10.1109\/BIBM.2017.8217622"},{"key":"457_CR109","unstructured":"Bepler, T. & Berger, B. Learning protein sequence embeddings using information from structure. Preprint at https:\/\/arxiv.org\/abs\/1902.08661 (2019)."},{"key":"457_CR110","unstructured":"Schwartz, A. S. et al. Deep semantic protein representation for annotation, discovery, and engineering. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/365965v1 (2018)."},{"key":"457_CR111","doi-asserted-by":"crossref","unstructured":"Kan\u00e9, H., Coulibali, M., Abdalla, A. & Ajanoh, P. Augmenting protein network embeddings with sequence information. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/730481v3 (2019).","DOI":"10.1101\/730481"},{"key":"457_CR112","doi-asserted-by":"publisher","first-page":"126","DOI":"10.4236\/jbise.2018.116012","volume":"11","author":"MR Faisal","year":"2018","unstructured":"Faisal, M. R. et al. Improving protein sequence classification performance using adjacent and overlapped segments on existing protein descriptors. JBiSE 11, 126\u2013143 (2018).","journal-title":"JBiSE"},{"key":"457_CR113","doi-asserted-by":"publisher","first-page":"2401","DOI":"10.1093\/bioinformatics\/btaa003","volume":"36","author":"N Strodthoff","year":"2020","unstructured":"Strodthoff, N., Wagner, P., Wenzel, M. & Samek, W. UDSMProt: universal deep sequence models for protein classification. Bioinformatics 36, 2401\u20132409 (2020).","journal-title":"Bioinformatics"},{"key":"457_CR114","doi-asserted-by":"crossref","unstructured":"Asgari, E., Poerner, N., McHardy, A. C. & Mofrad, M. R. K. DeepPrime2Sec: deep learning for protein secondary structure prediction from the primary sequences. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/early\/2019\/07\/18\/705426 (2019)","DOI":"10.1101\/705426"},{"key":"457_CR115","doi-asserted-by":"publisher","unstructured":"Bileschi, M. L. et al. Using deep learning to annotate the protein universe. Nat. Biotechnol. https:\/\/doi.org\/10.1038\/s41587-021-01179-w (2022).","DOI":"10.1038\/s41587-021-01179-w"},{"key":"457_CR116","doi-asserted-by":"publisher","unstructured":"Unsal, S. et al. Learning Functional Properties of Proteins with Language Models Data Sets (Zenodo, 2020); https:\/\/doi.org\/10.5281\/zenodo.5795850","DOI":"10.5281\/zenodo.5795850"},{"key":"457_CR117","doi-asserted-by":"publisher","unstructured":"Unsal, S. et al. PROBE (Protein Representation Benchmark): Function-Centric Evaluation of Protein Representation Methods (Code Ocean, 2021); https:\/\/doi.org\/10.24433\/CO.5123923.v2","DOI":"10.24433\/CO.5123923.v2"}],"container-title":["Nature Machine Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s42256-022-00457-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-022-00457-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-022-00457-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,25]],"date-time":"2022-11-25T09:21:00Z","timestamp":1669368060000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s42256-022-00457-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,21]]},"references-count":117,"journal-issue":{"issue":"3","published-online":{"date-parts":[[2022,3]]}},"alternative-id":["457"],"URL":"https:\/\/doi.org\/10.1038\/s42256-022-00457-9","relation":{},"ISSN":["2522-5839"],"issn-type":[{"value":"2522-5839","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,3,21]]},"assertion":[{"value":"14 August 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 February 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 March 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}