{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T16:17:09Z","timestamp":1781540229119,"version":"3.54.5"},"reference-count":103,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Nat Mach Intell"],"DOI":"10.1038\/s42256-024-00931-6","type":"journal-article","created":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T05:01:41Z","timestamp":1733461301000},"page":"1512-1524","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":32,"title":["Evaluating generalizability of artificial intelligence models for molecular datasets"],"prefix":"10.1038","volume":"6","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2759-4470","authenticated-orcid":false,"given":"Yasha","family":"Ektefaie","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Shen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Daria","family":"Bykova","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Maximillian G.","family":"Marin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8530-7228","authenticated-orcid":false,"given":"Marinka","family":"Zitnik","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3871-5760","authenticated-orcid":false,"given":"Maha","family":"Farhat","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,12,6]]},"reference":[{"key":"931_CR1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-022-31236-0","volume":"13","author":"AG Green","year":"2022","unstructured":"Green, A. G. et al. A convolutional neural network highlights mutations relevant to antimicrobial resistance in mycobacterium tuberculosis. Nat. Commun. 13, 3817 (2022).","journal-title":"Nat. Commun."},{"key":"931_CR2","doi-asserted-by":"publisher","first-page":"2242","DOI":"10.1109\/TCBB.2023.3240169","volume":"20","author":"V Kumar","year":"2023","unstructured":"Kumar, V., Deepak, A., Ranjan, A. & Prakash, A. Lite-SeqCNN: a light-weight deep CNN architecture for protein function prediction. IEEE\/ACM Trans. Comput. Biol. Bioinform. 20, 2242\u20132253 (2023).","journal-title":"IEEE\/ACM Trans. Comput. Biol. Bioinform."},{"key":"931_CR3","doi-asserted-by":"publisher","first-page":"e80942","DOI":"10.7554\/eLife.80942","volume":"12","author":"T Sanderson","year":"2023","unstructured":"Sanderson, T., Bileschi, M. L., Belanger, D. & Colwell, L. J. Proteinfer, deep neural networks for protein functional inference. eLife 12, e80942 (2023).","journal-title":"eLife"},{"key":"931_CR4","doi-asserted-by":"publisher","first-page":"932","DOI":"10.1038\/s41587-021-01179-w","volume":"40","author":"ML Bileschi","year":"2022","unstructured":"Bileschi, M. L. et al. Using deep learning to annotate the protein universe. Nat. Biotechnol. 40, 932\u2013937 (2022).","journal-title":"Nat. Biotechnol."},{"key":"931_CR5","doi-asserted-by":"publisher","first-page":"e70576","DOI":"10.7554\/eLife.70576","volume":"10","author":"D Griffith","year":"2021","unstructured":"Griffith, D. & Holehouse, A. S. Parrot is a flexible recurrent neural network framework for analysis of large protein datasets. eLife 10, e70576 (2021).","journal-title":"eLife"},{"key":"931_CR6","doi-asserted-by":"crossref","unstructured":"Liu, X. Deep recurrent neural network for protein function prediction from sequence. Preprint at https:\/\/arxiv.org\/abs\/1701.08318 (2017).","DOI":"10.1101\/103994"},{"key":"931_CR7","doi-asserted-by":"publisher","first-page":"8105","DOI":"10.1093\/nar\/gky567","volume":"46","author":"ST Hill","year":"2018","unstructured":"Hill, S. T. et al. A deep recurrent neural network discovers complex biological rules to decipher RNA protein-coding potential. Nucleic Acids Res. 46, 8105\u20138113 (2018).","journal-title":"Nucleic Acids Res."},{"key":"931_CR8","unstructured":"Zhang, Z., Xu, M., Jamasb, A. R., Chenthamarakshan, V., Lozano, A., Das, P. & Tang, J. Protein representation learning by geometric structure pretraining. In Proc. Eleventh International Conference on Learning Representations (2023)."},{"key":"931_CR9","unstructured":"Somnath, V. R., Bunne, C. & Krause, A. Multi-scale representation learning on proteins. In Advances in Neural Information Processing Systems Vol. 34 (eds Ranzato, M. et al.) 25244\u201325255 (Curran Associates, 2021)."},{"key":"931_CR10","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-022-12201-9","volume":"12","author":"K Jha","year":"2022","unstructured":"Jha, K., Saha, S. & Singh, H. Prediction of protein\u2013protein interaction using graph neural networks. Sci. Rep. 12, 8360 (2022).","journal-title":"Sci. Rep."},{"key":"931_CR11","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-023-36736-1","volume":"14","author":"Z Gao","year":"2023","unstructured":"Gao, Z. et al. Hierarchical graph learning for protein\u2013protein interaction. Nat. Commun. 14, 1093 (2023).","journal-title":"Nat. Commun."},{"key":"931_CR12","doi-asserted-by":"publisher","first-page":"1123","DOI":"10.1126\/science.ade2574","volume":"379","author":"Z Lin","year":"2023","unstructured":"Lin, Z. et al. Evolutionary-scale prediction of atomic-level protein structure with a language model. Science 379, 1123\u20131130 (2023).","journal-title":"Science"},{"key":"931_CR13","doi-asserted-by":"publisher","first-page":"1315","DOI":"10.1038\/s41592-019-0598-1","volume":"16","author":"EC Alley","year":"2019","unstructured":"Alley, E. C., Khimulya, G., Biswas, S., AlQuraishi, M. & Church, G. M. Unified rational protein engineering with sequence-based deep representation learning. Nat. Methods 16, 1315\u20131322 (2019).","journal-title":"Nat. Methods"},{"key":"931_CR14","doi-asserted-by":"publisher","first-page":"e2016239118","DOI":"10.1073\/pnas.2016239118","volume":"118","author":"A Rives","year":"2021","unstructured":"Rives, A. et al. Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences. Proc. Natl Acad. Sci. USA 118, e2016239118 (2021).","journal-title":"Proc. Natl Acad. Sci. USA"},{"key":"931_CR15","doi-asserted-by":"publisher","first-page":"1099","DOI":"10.1038\/s41587-022-01618-2","volume":"41","author":"A Madani","year":"2023","unstructured":"Madani, A. et al. Large language models generate functional protein sequences across diverse families. Nat. Biotechnol. 41, 1099\u20131106 (2023).","journal-title":"Nat. Biotechnol."},{"key":"931_CR16","unstructured":"Notin, P. et al. Tranception: protein fitness prediction with autoregressive transformers and inference-time retrieval. In Proc. 39th International Conference on Machine Learning, Vol. 162 (eds Chaudhuri, K. et al.) 16990\u201317017 (PMLR, 2022)."},{"key":"931_CR17","doi-asserted-by":"publisher","first-page":"243","DOI":"10.1016\/j.tcb.2021.10.010","volume":"32","author":"BW Wright","year":"2022","unstructured":"Wright, B. W., Yi, Z., Weissman, J. S. & Chen, J. The dark proteome: translation from noncanonical open reading frames. Trends Cell Biol. 32, 243\u2013258 (2022).","journal-title":"Trends Cell Biol."},{"key":"931_CR18","unstructured":"Liu, J. et al. Towards out-of-distribution generalization: a survey. Preprint at https:\/\/arxiv.org\/abs\/2108.13624 (2023)."},{"key":"931_CR19","unstructured":"Ye, H. et al. Towards a theoretical framework of out-of-distribution generalization. In Advances in Neural Information Processing Systems (eds Beygelzimer, A. et al.) 1801 (2021)."},{"key":"931_CR20","unstructured":"Weber, M. et al. Certifying out-of-domain generalization for blackbox functions. In Proc. 39th International Conference on Machine Learning Vol. 162 (eds Chaudhuri, K. et al.) 23527\u201323548 (PMLR, 2022)."},{"key":"931_CR21","unstructured":"Koh, P. W. et al. Wilds: a benchmark of in-the-wild distribution shifts. Preprint at https:\/\/arxiv.org\/abs\/2012.07421 (2021)."},{"key":"931_CR22","unstructured":"Liang, P. et al. Holistic evaluation of language models. Trans. Mach. Learn. Res. 2835\u20138856 (2023)."},{"key":"931_CR23","doi-asserted-by":"crossref","unstructured":"Rao, R. et al. Evaluating protein transfer learning with TAPE. In Advances in Neural Information Processing Systems 32, 9689\u20139701 (2019).","DOI":"10.1101\/676825"},{"key":"931_CR24","unstructured":"Xu, M. et al. PEER: a comprehensive and multi-task benchmark for protein sequence understanding. In Proc. Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2022)."},{"key":"931_CR25","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-022-19608-4","volume":"12","author":"H Capel","year":"2022","unstructured":"Capel, H. et al. Proteinglue multi-task benchmark suite for self-supervised protein modeling. Sci. Rep. 12, 16047 (2022).","journal-title":"Sci. Rep."},{"key":"931_CR26","doi-asserted-by":"crossref","unstructured":"Dallago, C. et al. FLIP: benchmark tasks in fitness landscape inference for proteins. In Proc. Neural Information Processing Systems Track on Datasets and Benchmarks Vol. 1 (eds Vanschoren, J. & Yeung, S.) (2021).","DOI":"10.1101\/2021.11.09.467890"},{"key":"931_CR27","doi-asserted-by":"crossref","unstructured":"Hu, Y., Jacob, J., Parker, G. J. M. et al. The challenges of deploying artificial intelligence models in a rapidly evolving pandemic. Nat. Mach. Intell. 2, 298\u2013300 (2020)","DOI":"10.1038\/s42256-020-0185-2"},{"key":"931_CR28","doi-asserted-by":"publisher","first-page":"1014256","DOI":"10.3389\/fimmu.2022.1014256","volume":"13","author":"F Grazioli","year":"2022","unstructured":"Grazioli, F. et al. On TCR binding predictors failing to generalize to unseen peptides. Front. Immunol. 13, 1014256 (2022).","journal-title":"Front. Immunol."},{"key":"931_CR29","doi-asserted-by":"publisher","first-page":"756","DOI":"10.1038\/s41551-023-01049-7","volume":"7","author":"S Azizi","year":"2023","unstructured":"Azizi, S. et al. Robust and data-efficient generalization of self-supervised machine learning for diagnostic imaging. Nat. Biomed. Eng. 7, 756\u2013779 (2023).","journal-title":"Nat. Biomed. Eng."},{"key":"931_CR30","doi-asserted-by":"publisher","first-page":"665","DOI":"10.1038\/s42256-020-00257-z","volume":"2","author":"R Geirhos","year":"2020","unstructured":"Geirhos, R. et al. Shortcut learning in deep neural networks. Nat. Mach. Intell. 2, 665\u2013673 (2020).","journal-title":"Nat. Mach. Intell."},{"key":"931_CR31","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-023-37572-z","volume":"14","author":"A Chatterjee","year":"2023","unstructured":"Chatterjee, A. et al. Improving the generalizability of protein-ligand binding predictions with AI-bind. Nat. Commun. 14, 1989 (2023).","journal-title":"Nat. Commun."},{"key":"931_CR32","doi-asserted-by":"publisher","first-page":"E7","DOI":"10.1038\/s41586-021-04207-6","volume":"601","author":"J Frazer","year":"2022","unstructured":"Frazer, J. et al. Disease variant prediction with deep generative models of evolutionary data. Nature 601, E7 (2022).","journal-title":"Nature"},{"key":"931_CR33","doi-asserted-by":"publisher","first-page":"818","DOI":"10.1038\/s41586-023-06617-0","volume":"622","author":"NN Thadani","year":"2023","unstructured":"Thadani, N. N. et al. Learning from prepandemic data to forecast viral escape. Nature 622, 818\u2013825 (2023).","journal-title":"Nature"},{"key":"931_CR34","unstructured":"Stark, H., Ganea, O.-E., Pattanaik, L., Barzilay, R. & Jaakkola, T. Equibind: geometric deep learning for drug binding structure prediction. In Proc. 39th International Conference on Machine Learning Vol. 162 (eds Chaudhuri, K. et al.) 20503\u201320521 (PMLR, 2022)."},{"key":"931_CR35","doi-asserted-by":"crossref","unstructured":"Mahajan, S. P., Ruffolo, J. A. & Gray, J. J. Contextual protein and antibody encodings from equivariant graph transformers. Preprint at bioRxiv https:\/\/www.biorxiv.org\/content\/10.1101\/2023.07.15.549154v2 (2023).","DOI":"10.1101\/2023.07.15.549154"},{"key":"931_CR36","doi-asserted-by":"publisher","DOI":"10.1186\/s12859-019-2932-0","volume":"20","author":"M AlQuraishi","year":"2019","unstructured":"AlQuraishi, M. ProteinNet: a standardized data set for machine learning of protein structure. BMC Bioinformatics 20, 311 (2019).","journal-title":"BMC Bioinformatics"},{"key":"931_CR37","doi-asserted-by":"publisher","first-page":"eadl2528","DOI":"10.1126\/science.adl2528","volume":"384","author":"R Krishna","year":"2024","unstructured":"Krishna, R. et al. Generalized biomolecular modeling and design with RoseTTAFold All-Atom. Science 384, eadl2528 (2024).","journal-title":"Science"},{"key":"931_CR38","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-023-38347-2","volume":"14","author":"A Kroll","year":"2023","unstructured":"Kroll, A., Ranjan, S., Engqvist, M. K. M. & Lercher, M. J. A general model to predict small molecule substrates of enzymes based on machine and deep learning. Nat. Commun. 14, 2787 (2023).","journal-title":"Nat. Commun."},{"key":"931_CR39","doi-asserted-by":"publisher","first-page":"40","DOI":"10.1038\/s41580-021-00407-0","volume":"23","author":"JG Greener","year":"2022","unstructured":"Greener, J. G., Kandathil, S. M., Moffat, L. & Jones, D. T. A guide to machine learning for biologists. Nat. Rev. Mol. Cell Biol. 23, 40\u201355 (2022).","journal-title":"Nat. Rev. Mol. Cell Biol."},{"key":"931_CR40","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-023-47204-7","volume":"13","author":"MA Jambrich","year":"2023","unstructured":"Jambrich, M. A., Tusnady, G. E. & Dobson, L. How AlphaFold2 shaped the structural coverage of the human transmembrane proteome. Sci. Rep. 13, 20283 (2023).","journal-title":"Sci. Rep."},{"key":"931_CR41","doi-asserted-by":"publisher","first-page":"4111","DOI":"10.1021\/jm048957q","volume":"48","author":"R Wang","year":"2005","unstructured":"Wang, R., Fang, X., Lu, Y., Yang, C.-Y. & Wang, S. The PDBBind database: methodologies and updates. J. Med. Chem. 48, 4111\u20134119 (2005).","journal-title":"J. Med. Chem."},{"key":"931_CR42","unstructured":"Krause, B., Lu, L., Murray, I. & Renals, S. Multiplicative LSTM for sequence modelling. Preprint at https:\/\/arxiv.org\/abs\/1609.07959 (2017)."},{"key":"931_CR43","doi-asserted-by":"publisher","first-page":"1295","DOI":"10.1093\/bioinformatics\/btx780","volume":"34","author":"J Hou","year":"2017","unstructured":"Hou, J., Adhikari, B. & Cheng, J. DeepSF: deep convolutional neural network for mapping protein sequences to folds. Bioinformatics 34, 1295\u20131303 (2017).","journal-title":"Bioinformatics"},{"key":"931_CR44","first-page":"44\u201357.e9","volume":"29","author":"AJ Greaney","year":"2021","unstructured":"Greaney, A. J. et al. Complete mapping of mutations to the SARS-CoV-2 spike receptor-binding domain that escape antibody recognition. Cell Host Microbe 29, 44\u201357.e9 (2021).","journal-title":"Cell Host Microbe"},{"key":"931_CR45","doi-asserted-by":"publisher","first-page":"397","DOI":"10.1038\/nature17995","volume":"533","author":"KS Sarkisyan","year":"2016","unstructured":"Sarkisyan, K. S. et al. Local fitness landscape of the green fluorescent protein. Nature 533, 397\u2013401 (2016).","journal-title":"Nature"},{"key":"931_CR46","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-021-22732-w","volume":"12","author":"J-E Shin","year":"2021","unstructured":"Shin, J.-E. et al. Protein design and variant prediction using autoregressive generative models. Nat. Commun. 12, 2403 (2021).","journal-title":"Nat. Commun."},{"key":"931_CR47","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1128\/CMR.00034-09","volume":"23","author":"GN Forrest","year":"2010","unstructured":"Forrest, G. N. & Tamura, K. Rifampin combination therapy for nonmycobacterial infections. Clin. Microbiol. Rev. 23, 14\u201334 (2010).","journal-title":"Clin. Microbiol. Rev."},{"key":"931_CR48","doi-asserted-by":"publisher","first-page":"625","DOI":"10.1038\/ja.2014.107","volume":"67","author":"BP Goldstein","year":"2014","unstructured":"Goldstein, B. P. Resistance to rifampicin: a review. J. Antibiot. 67, 625\u2013630 (2014).","journal-title":"J. Antibiot."},{"key":"931_CR49","unstructured":"Vaswani, A. et al. Attention is all you need. In Advances in Neural Information Processing Systems Vol. 30 (eds Guyon, I. et al.) (Curran Associates, 2017)."},{"key":"931_CR50","doi-asserted-by":"crossref","unstructured":"Cui, H., Wang, C., Maan, H. et al. scGPT: toward building a foundation model for single-cell multi-omics using generative AI. Nat. Methods 21, 1470\u20131480 (2024).","DOI":"10.1038\/s41592-024-02201-0"},{"key":"931_CR51","unstructured":"Ramesh, A. et al. Zero-shot text-to-image generation. In Proc. 38th International Conference on Machine Learning Vol. 139 (eds Meila, M. & Zhang, T.) 8821\u20138831 (PMLR, 2021)."},{"key":"931_CR52","unstructured":"Brown, T. B. et al. Language models are few-shot learners. In Advances in Neural Information Processing Systems Vol. 33 (eds Larochelle, H. et al.) 1877\u20131901 (Curran Associates, 2020)."},{"key":"931_CR53","unstructured":"Touvron, H. et al. Llama 2: open foundation and fine-tuned chat models. Preprint at https:\/\/arxiv.org\/abs\/2307.09288 (2023)."},{"key":"931_CR54","unstructured":"Anil, R. et al. Palm 2 Technical Report. Preprint at https:\/\/arxiv.org\/abs\/2305.10403 (2023)."},{"key":"931_CR55","doi-asserted-by":"publisher","unstructured":"Kedzierska, K. Z., Crawford, L., Amini, A. P. & Lu, A. X. Assessing the limits of zero-shot foundation models in single-cell biology. Preprint at bioRxiv https:\/\/doi.org\/10.1101\/2023.10.16.561085 (2023).","DOI":"10.1101\/2023.10.16.561085"},{"key":"931_CR56","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1126\/science.adg8538","volume":"383","author":"AM Chekroud","year":"2024","unstructured":"Chekroud, A. M. et al. Illusory generalizability of clinical prediction models. Science 383, 164\u2013167 (2024).","journal-title":"Science"},{"key":"931_CR57","doi-asserted-by":"publisher","first-page":"D523","DOI":"10.1093\/nar\/gkac1052","volume":"51","author":"TU Consortium","year":"2022","unstructured":"Consortium, T. U. UniProt: the Universal Protein Knowledgebase in 2023. Nucleic Acids Res. 51, D523\u2013D531 (2022).","journal-title":"Nucleic Acids Res."},{"key":"931_CR58","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-022-28313-9","volume":"13","author":"N Anand","year":"2022","unstructured":"Anand, N. et al. Protein sequence design with a learned potential. Nat. Commun. 13, 746 (2022).","journal-title":"Nat. Commun."},{"key":"931_CR59","doi-asserted-by":"crossref","unstructured":"Guo, Z., Liu, J., Wang, Y. et al. Diffusion models in bioinformatics and computational biology. Nat. Rev. Bioeng. 2, 136\u2013154 (2024).","DOI":"10.1038\/s44222-023-00114-9"},{"key":"931_CR60","doi-asserted-by":"publisher","unstructured":"Youssef, A. et al. Rapidai: a framework for rapidly deployable ai for novel disease and pandemic preparedness. Preprint at medRxiv https:\/\/doi.org\/10.1101\/2022.08.09.22278600 (2022).","DOI":"10.1101\/2022.08.09.22278600"},{"key":"931_CR61","doi-asserted-by":"publisher","first-page":"e2025581118","DOI":"10.1073\/pnas.2025581118","volume":"118","author":"D Morselli Gysi","year":"2021","unstructured":"Morselli Gysi, D. et al. Network medicine framework for identifying drug-repurposing opportunities for COVID-19. Proc. Natl Acad. Sci. USA 118, e2025581118 (2021).","journal-title":"Proc. Natl Acad. Sci. USA"},{"key":"931_CR62","doi-asserted-by":"publisher","first-page":"1033","DOI":"10.1038\/s41589-022-01131-2","volume":"18","author":"K Huang","year":"2022","unstructured":"Huang, K. et al. Artificial intelligence foundation for therapeutic science. Nat. Chem. Biol. 18, 1033\u20131036 (2022).","journal-title":"Nat. Chem. Biol."},{"key":"931_CR63","doi-asserted-by":"crossref","unstructured":"Gainza, P. et al. De novo design of protein interactions with learned surface fingerprints. Nature 617, 176\u2013184 (2023).","DOI":"10.1038\/s41586-023-05993-x"},{"key":"931_CR64","doi-asserted-by":"crossref","unstructured":"Wong, F. et al. Discovery of a structural class of antibiotics with explainable deep learning. Nature 626, 177\u2013185 (2024).","DOI":"10.1038\/s41586-023-06887-8"},{"key":"931_CR65","unstructured":"Buttenschoen, M., Morris, G. M. & Deane, C. M. Posebusters: AI-based docking methods fail to generate physically valid poses or generalise to novel sequences. Preprint at https:\/\/arxiv.org\/abs\/2308.05777 (2023)."},{"key":"931_CR66","unstructured":"Li, J. et al. Leak proof PDBBind: a reorganized dataset of protein-ligand complexes for more generalizable binding affinity prediction. Preprint at https:\/\/arxiv.org\/abs\/2308.09639 (2024)."},{"key":"931_CR67","doi-asserted-by":"publisher","first-page":"1181","DOI":"10.1056\/NEJMc1413930","volume":"372","author":"E Sanchez-Padilla","year":"2015","unstructured":"Sanchez-Padilla, E. et al. Detection of drug-resistant tuberculosis by xpert MTB\/RIF in Swaziland. New Eng. J. Med. 372, 1181\u20131182 (2015).","journal-title":"New Eng. J. Med."},{"key":"931_CR68","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-023-41967-3","volume":"14","author":"AL Dias","year":"2023","unstructured":"Dias, A. L., Bustillo, L. & Rodrigues, T. Limitations of representation learning in small molecule property prediction. Nat. Commun. 14, 6394 (2023).","journal-title":"Nat. Commun."},{"key":"931_CR69","doi-asserted-by":"crossref","unstructured":"Hsu, C. et al. Learning inverse folding from millions of predicted structures. In Proc. 39th International Conference on Machine Learning Vol. 162 (eds Chaudhuri, K. et al.) 8946\u20138970 (PMLR, 2022).","DOI":"10.1101\/2022.04.10.487779"},{"key":"931_CR70","first-page":"3.1.1\u20133.1.8","volume":"Chapter 3","author":"WR Pearson","year":"2013","unstructured":"Pearson, W. R. An introduction to sequence similarity (\u2018homology\u2019) searching. Curr. Protoc. Bioinformatics Chapter 3, 3.1.1\u20133.1.8 (2013).","journal-title":"Curr. Protoc. Bioinformatics"},{"key":"931_CR71","doi-asserted-by":"publisher","first-page":"1422","DOI":"10.1093\/bioinformatics\/btp163","volume":"25","author":"PJA Cock","year":"2009","unstructured":"Cock, P. J. A. et al. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25, 1422\u20131423 (2009).","journal-title":"Bioinformatics"},{"key":"931_CR72","doi-asserted-by":"crossref","unstructured":"Krivelevich, M., M\u00e9sz\u00e1ros, T., Michaeli, P. & Shikhelman, C. Greedy maximal independent sets via local limits. Preprint at https:\/\/arxiv.org\/abs\/1907.07216 (2023).","DOI":"10.1002\/rsa.21200"},{"key":"931_CR73","doi-asserted-by":"crossref","unstructured":"Karp, R. M. Reducibility among Combinatorial Problems (ed. Bohlinger, J. D.) 85\u2013103 (Springer, 1972).","DOI":"10.1007\/978-1-4684-2001-2_9"},{"key":"931_CR74","doi-asserted-by":"publisher","first-page":"2114","DOI":"10.1093\/bioinformatics\/btu170","volume":"30","author":"AM Bolger","year":"2014","unstructured":"Bolger, A. M., Lohse, M. & Usadel, B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics 30, 2114\u20132120 (2014).","journal-title":"Bioinformatics"},{"key":"931_CR75","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1089\/cmb.2012.0021","volume":"19","author":"A Bankevich","year":"2012","unstructured":"Bankevich, A. et al. SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing. J. Comput. Biol. 19, 455\u2013477 (2012).","journal-title":"J. Comput. Biol."},{"key":"931_CR76","doi-asserted-by":"publisher","first-page":"3094","DOI":"10.1093\/bioinformatics\/bty191","volume":"34","author":"H Li","year":"2018","unstructured":"Li, H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics 34, 3094\u20133100 (2018).","journal-title":"Bioinformatics"},{"key":"931_CR77","doi-asserted-by":"publisher","first-page":"e112963","DOI":"10.1371\/journal.pone.0112963","volume":"9","author":"BJ Walker","year":"2014","unstructured":"Walker, B. J. et al. Pilon: an integrated tool for comprehensive microbial variant detection and genome assembly improvement. PLoS ONE 9, e112963 (2014).","journal-title":"PLoS ONE"},{"key":"931_CR78","doi-asserted-by":"publisher","DOI":"10.1093\/gigascience\/giab008","volume":"10","author":"P Danecek","year":"2021","unstructured":"Danecek, P. et al. Twelve years of SAMtools and BCFtools. Gigascience 10, giab008 (2021).","journal-title":"Gigascience"},{"key":"931_CR79","doi-asserted-by":"publisher","first-page":"3387","DOI":"10.1093\/bioinformatics\/btx431","volume":"33","author":"JJ Almagro Armenteros","year":"2017","unstructured":"Almagro Armenteros, J. J., S\u00f8nderby, C. K., S\u00f8nderby, S. K., Nielsen, H. & Winther, O. DeepLoc: prediction of protein subcellular localization using deep learning. Bioinformatics 33, 3387\u20133395 (2017).","journal-title":"Bioinformatics"},{"key":"931_CR80","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-022-34742-3","volume":"13","author":"M Seuma","year":"2022","unstructured":"Seuma, M., Lehner, B. & Bolognesi, B. An atlas of amyloid aggregation: the impact of substitutions, insertions, deletions and truncations on amyloid beta fibril nucleation. Nat. Commun. 13, 7084 (2022).","journal-title":"Nat. Commun."},{"key":"931_CR81","doi-asserted-by":"publisher","first-page":"1537","DOI":"10.1261\/rna.040709.113","volume":"19","author":"D Melamed","year":"2013","unstructured":"Melamed, D., Young, D. L., Gamble, C. E., Miller, C. R. & Fields, S. Deep mutational scanning of an RRM domain of the Saccharomyces cerevisiae poly(a)-binding protein. RNA 19, 1537\u20131551 (2013).","journal-title":"RNA"},{"key":"931_CR82","doi-asserted-by":"publisher","first-page":"1295","DOI":"10.1093\/bioinformatics\/btx780","volume":"34","author":"J Hou","year":"2018","unstructured":"Hou, J., Adhikari, B. & Cheng, J. DeepSF: deep convolutional neural network for mapping protein sequences to folds. Bioinformatics 34, 1295\u20131303 (2018).","journal-title":"Bioinformatics"},{"key":"931_CR83","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1002\/prot.25674","volume":"87","author":"MS Klausen","year":"2019","unstructured":"Klausen, M. S. et al. NetSurfP-2.0: improved prediction of protein structural features by integrated deep learning. Proteins 87, 520\u2013527 (2019).","journal-title":"Proteins"},{"key":"931_CR84","unstructured":"Corso, G., Stark, H., Jing, B., Barzilay, R. & Jaakkola, T. Diffdock: diffusion steps, twists, and turns for molecular docking. In Proc. Eleventh International Conference on Learning Representations (2023)."},{"key":"931_CR85","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1186\/1758-2946-3-33","volume":"3","author":"NM O\u2019Boyle","year":"2011","unstructured":"O\u2019Boyle, N. M. et al. Open Babel: an open chemical toolbox. J. Cheminform. 3, 33 (2011).","journal-title":"J. Cheminform."},{"key":"931_CR86","doi-asserted-by":"publisher","first-page":"403","DOI":"10.1016\/S0022-2836(05)80360-2","volume":"215","author":"SF Altschul","year":"1990","unstructured":"Altschul, S. F., Gish, W., Miller, W., Myers, E. W. & Lipman, D. J. Basic local alignment search tool. J. Mol. Biol. 215, 403\u2013410 (1990).","journal-title":"J. Mol. Biol."},{"key":"931_CR87","doi-asserted-by":"publisher","first-page":"243","DOI":"10.1038\/s41587-023-01773-0","volume":"42","author":"M van Kempen","year":"2024","unstructured":"van Kempen, M. et al. Fast and accurate protein structure search with foldseek. Nat. Biotechnol. 42, 243\u2013246 (2024).","journal-title":"Nat. Biotechnol."},{"key":"931_CR88","doi-asserted-by":"publisher","first-page":"726","DOI":"10.1021\/jm061277y","volume":"50","author":"MJ Hartshorn","year":"2007","unstructured":"Hartshorn, M. J. et al. Diverse, high-quality test set for the validation of proteinligand docking performance. J. Med. Chem. 50, 726\u2013741 (2007).","journal-title":"J. Med. Chem."},{"key":"931_CR89","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1107\/S2052520616003954","volume":"72","author":"CR Groom","year":"2016","unstructured":"Groom, C. R., Bruno, I. J., Lightfoot, M. P. & Ward, S. C. The Cambridge Structural Database. Acta Crystallogr. Sect. B 72, 171\u2013179 (2016).","journal-title":"Acta Crystallogr. Sect. B"},{"key":"931_CR90","doi-asserted-by":"publisher","first-page":"356","DOI":"10.1016\/j.ebiom.2019.04.016","volume":"43","author":"ML Chen","year":"2019","unstructured":"Chen, M. L. et al. Beyond multidrug resistance: leveraging rare variants with machine and statistical learning models in Mycobacterium tuberculosis resistance prediction. EBioMedicine 43, 356\u2013369 (2019).","journal-title":"EBioMedicine"},{"key":"931_CR91","unstructured":"Alain, G. & Bengio, Y. Understanding intermediate layers using linear classifier probes. In Proc. Fifth International Conference on Learning Representations (2018)."},{"key":"931_CR92","doi-asserted-by":"publisher","first-page":"e1002195","DOI":"10.1371\/journal.pcbi.1002195","volume":"7","author":"SR Eddy","year":"2011","unstructured":"Eddy, S. R. Accelerated profile HMM searches. PLoS Comput. Biol. 7, e1002195 (2011).","journal-title":"PLoS Comput. Biol."},{"key":"931_CR93","doi-asserted-by":"publisher","first-page":"1582","DOI":"10.1093\/bioinformatics\/bty862","volume":"35","author":"TA Hopf","year":"2018","unstructured":"Hopf, T. A. et al. The EVcouplings Python framework for coevolutionary sequence analysis. Bioinformatics 35, 1582\u20131584 (2018).","journal-title":"Bioinformatics"},{"key":"931_CR94","unstructured":"Wandb (Weights & Biases, 2020); https:\/\/wandb.com"},{"key":"931_CR95","unstructured":"Paszke, A. et al. Pytorch: an imperative style, high-performance deep learning library. In Proc. 33rd International Conference on Neural Information Processing Systems 721 (Curran Associates, 2019)."},{"key":"931_CR96","doi-asserted-by":"publisher","unstructured":"Ektefaie, Y. SPECTRA. Harvard Dataverse, vol. V2 https:\/\/doi.org\/10.7910\/DVN\/W5UUNN (2024).","DOI":"10.7910\/DVN\/W5UUNN"},{"key":"931_CR97","unstructured":"Ektefaie, Y. SPECTRA (the spectral framework of model evaluation) v.1.0.3. GitHub https:\/\/github.com\/mims-harvard\/SPECTRA (2024)."},{"key":"931_CR98","doi-asserted-by":"crossref","unstructured":"Yu, F., Koltun, V. & Funkhouser, T. Dilated residual networks. In Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 636\u2013644 (2017).","DOI":"10.1109\/CVPR.2017.75"},{"key":"931_CR99","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1002\/prot.25415","volume":"86","author":"J Moult","year":"2018","unstructured":"Moult, J., Fidelis, K., Kryshtafovych, A., Schwede, T. & Tramontano, A. Critical assessment of methods of protein structure prediction (CASP)-Round XII. Proteins 86, 7\u201315 (2018).","journal-title":"Proteins"},{"key":"931_CR100","first-page":"482","volume":"19","author":"Y Yang","year":"2018","unstructured":"Yang, Y. et al. Sixty-five years of the long march in protein secondary structure prediction: the final stretch? Brief. Bioinform. 19, 482\u2013494 (2018).","journal-title":"Brief. Bioinform."},{"key":"931_CR101","doi-asserted-by":"publisher","first-page":"508","DOI":"10.1002\/(SICI)1097-0134(19990301)34:4<508::AID-PROT10>3.0.CO;2-4","volume":"34","author":"JA Cuff","year":"1999","unstructured":"Cuff, J. A. & Barton, G. J. Evaluation and improvement of multiple sequence methods for protein secondary structure prediction. Proteins 34, 508\u2013519 (1999).","journal-title":"Proteins"},{"key":"931_CR102","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1038\/nmeth.1818","volume":"9","author":"M Remmert","year":"2011","unstructured":"Remmert, M., Biegert, A., Hauser, A. & S\u00f6ding, J. HHblits: lightning-fast iterative protein sequence searching by HMM-HMM alignment. Nat. Methods 9, 173\u2013175 (2011).","journal-title":"Nat. Methods"},{"key":"931_CR103","doi-asserted-by":"crossref","unstructured":"Lu, W. et al. Tankbind: trigonometry-aware neural networks for drug-protein binding structure prediction. In Advances in Neural Information Processing Systems Vol. 35 (eds Koyejo, S. et al.) 7236\u20137249 (Curran Associates, 2022).","DOI":"10.1101\/2022.06.06.495043"}],"container-title":["Nature Machine Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s42256-024-00931-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-024-00931-6","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-024-00931-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,17]],"date-time":"2024-12-17T19:04:01Z","timestamp":1734462241000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s42256-024-00931-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,6]]},"references-count":103,"journal-issue":{"issue":"12","published-online":{"date-parts":[[2024,12]]}},"alternative-id":["931"],"URL":"https:\/\/doi.org\/10.1038\/s42256-024-00931-6","relation":{"has-preprint":[{"id-type":"doi","id":"10.1101\/2024.02.25.581982","asserted-by":"object"}]},"ISSN":["2522-5839"],"issn-type":[{"value":"2522-5839","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,6]]},"assertion":[{"value":"31 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 October 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 December 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}