{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T20:30:13Z","timestamp":1777581013700,"version":"3.51.4"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T00:00:00Z","timestamp":1740441600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T00:00:00Z","timestamp":1740441600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100000923","name":"Department of Education and Training | Australian Research Council","doi-asserted-by":"publisher","award":["FT210100097,DP240101547"],"award-info":[{"award-number":["FT210100097,DP240101547"]}],"id":[{"id":"10.13039\/501100000923","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CSIRO \u2013 National Science Foundation (US) AI Research Collaboration Program"},{"DOI":"10.13039\/501100000925","name":"Department of Health | National Health and Medical Research Council","doi-asserted-by":"publisher","award":["APP2013629"],"award-info":[{"award-number":["APP2013629"]}],"id":[{"id":"10.13039\/501100000925","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000925","name":"Department of Health | National Health and Medical Research Council","doi-asserted-by":"publisher","award":["APP2013629"],"award-info":[{"award-number":["APP2013629"]}],"id":[{"id":"10.13039\/501100000925","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000925","name":"Department of Health | National Health and Medical Research Council","doi-asserted-by":"publisher","award":["2013629"],"award-info":[{"award-number":["2013629"]}],"id":[{"id":"10.13039\/501100000925","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Health and Medical Research Council (NHMRC) of Australia, Department of Health and Aged Care (MRFF) Stem Cell Therapies Mission grant"},{"name":"National Health and Medical Research Council (NHMRC) of Australia, Department of Health and Aged Care (MRFF) Stem Cell Therapies Mission grant"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Nat Mach Intell"],"DOI":"10.1038\/s42256-025-00994-z","type":"journal-article","created":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T10:03:19Z","timestamp":1740477799000},"page":"437-447","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":50,"title":["Large language models for scientific discovery in molecular property prediction"],"prefix":"10.1038","volume":"7","author":[{"given":"Yizhen","family":"Zheng","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0488-2616","authenticated-orcid":false,"given":"Huan Yee","family":"Koh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3503-5708","authenticated-orcid":false,"given":"Jiaxin","family":"Ju","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0528-9416","authenticated-orcid":false,"given":"Anh T. N.","family":"Nguyen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4412-1707","authenticated-orcid":false,"given":"Lauren T.","family":"May","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9963-5169","authenticated-orcid":false,"given":"Geoffrey I.","family":"Webb","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0794-527X","authenticated-orcid":false,"given":"Shirui","family":"Pan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,25]]},"reference":[{"key":"994_CR1","doi-asserted-by":"crossref","first-page":"1104","DOI":"10.1257\/aer.20180338","volume":"110","author":"N Bloom","year":"2020","unstructured":"Bloom, N., Jones, C. I., Reenen, J. & Webb, M. Are ideas getting harder to find? Am. Econ. Rev. 110, 1104\u20131144 (2020).","journal-title":"Am. Econ. Rev."},{"key":"994_CR2","doi-asserted-by":"crossref","first-page":"47","DOI":"10.1038\/s41586-023-06221-2","volume":"620","author":"H Wang","year":"2023","unstructured":"Wang, H. et al. Scientific discovery in the age of artificial intelligence. Nature 620, 47\u201360 (2023).","journal-title":"Nature"},{"key":"994_CR3","doi-asserted-by":"crossref","unstructured":"Frank, M. Baby steps in evaluating the capacities of large language models. Nat. Rev. Psychol. 2, 451\u2013452 (2023).","DOI":"10.1038\/s44159-023-00211-x"},{"key":"994_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T. et al. Language models are few-shot learners. Adv. Neural Inf. Process Syst. 33, 1877\u2013901 (2020).","journal-title":"Adv. Neural Inf. Process Syst."},{"key":"994_CR5","unstructured":"Achiam, J. et al. Gpt-4 technical report. Preprint at https:\/\/arxiv.org\/abs\/2303.08774 (2023)."},{"key":"994_CR6","doi-asserted-by":"crossref","unstructured":"Jiang, L. Y. et al. Health system-scale language models are all-purpose prediction engines. Nature 619, 357\u2013362 (2023).","DOI":"10.1038\/s41586-023-06160-y"},{"key":"994_CR7","doi-asserted-by":"crossref","unstructured":"Singhal, K. et al. Large language models encode clinical knowledge. Nature 620, 172\u2013180 (2023).","DOI":"10.1038\/s41586-023-06291-2"},{"key":"994_CR8","unstructured":"Mirza, A. et al. Are large language models superhuman chemists? Preprint at https:\/\/arxiv.org\/abs\/2404.01475 (2024)."},{"key":"994_CR9","doi-asserted-by":"crossref","first-page":"513","DOI":"10.1039\/C7SC02664A","volume":"9","author":"Z Wu","year":"2018","unstructured":"Wu, Z. et al. MoleculeNet: a benchmark for molecular machine learning. Chem. Sci. 9, 513\u2013530 (2018).","journal-title":"Chem. Sci."},{"key":"994_CR10","unstructured":"Taylor, R. et al. Galactica: a large language model for science. Preprint at https:\/\/arxiv.org\/abs\/2211.09085 (2022)."},{"key":"994_CR11","unstructured":"Almazrouei, E. et al. The Falcon series of open language models. Preprint at https:\/\/arxiv.org\/abs\/2311.16867 (2023)."},{"key":"994_CR12","doi-asserted-by":"crossref","first-page":"4","DOI":"10.1016\/j.addr.2012.09.019","volume":"64","author":"CA Lipinski","year":"2012","unstructured":"Lipinski, C. A., Lombardo, F., Dominy, B. W. & Feeney, P. J. Experimental and computational approaches to estimate solubility and permeability in drug discovery and development settings. Adv. Drug Deliv. Rev. 64, 4\u201317 (2012).","journal-title":"Adv. Drug Deliv. Rev."},{"key":"994_CR13","doi-asserted-by":"publisher","unstructured":"Landrum, G. et al. rdkit\/rdkit: 2024_09_5 (Q3 2024) Release (Release_2024_09_5). Zenodo https:\/\/doi.org\/10.5281\/zenodo.14779836 (2025).","DOI":"10.5281\/zenodo.14779836"},{"key":"994_CR14","unstructured":"Hu, W. et al. Strategies for pre-training graph neural networks. In Proc. International Conference on Learning Representations (2020)."},{"key":"994_CR15","first-page":"5812","volume":"33","author":"Y You","year":"2020","unstructured":"You, Y. et al. Graph contrastive learning with augmentations. Adv. Neural Inf. Process. Sys. 33, 5812\u20135823 (2020).","journal-title":"Adv. Neural Inf. Process. Sys."},{"key":"994_CR16","doi-asserted-by":"crossref","first-page":"279","DOI":"10.1038\/s42256-022-00447-x","volume":"4","author":"Y Wang","year":"2022","unstructured":"Wang, Y., Wang, J., Cao, Z. & Farimani, A. B. Molecular contrastive learning of representations via graph neural networks. Nat. Mach. Intell. 4, 279\u2013287 (2022).","journal-title":"Nat. Mach. Intell."},{"key":"994_CR17","unstructured":"St\u00e4rk, H. et al. 3D Infomax improves GNNs for molecular property prediction. In Proc. International Conference on Machine Learning (eds Chaudhuri, K. et al.) 20479\u201320502 (PMLR, 2022)."},{"key":"994_CR18","unstructured":"Liu, S. et al. Pre-training molecular graph representation with 3D geometry. In Proc. 10th International Conference on Learning Representations (2022)."},{"key":"994_CR19","doi-asserted-by":"crossref","unstructured":"Xia, J. et al. Mole-bert: rethinking pre-training graph neural networks for molecules. In Proc. 11th International Conference on Learning Representations (2023).","DOI":"10.26434\/chemrxiv-2023-dngg4"},{"key":"994_CR20","first-page":"12559","volume":"33","author":"Y Rong","year":"2020","unstructured":"Rong, Y. et al. Self-supervised graph transformer on large-scale molecular data. Adv. Neural Inf. Process. Sys. 33, 12559\u201312571 (2020).","journal-title":"Adv. Neural Inf. Process. Sys."},{"key":"994_CR21","doi-asserted-by":"crossref","unstructured":"Zhou, G. et al. Uni-mol: a universal 3D molecular representation learning framework. In Proc. 11th International Conference on Learning Representations (2023).","DOI":"10.26434\/chemrxiv-2022-jjm0j-v4"},{"key":"994_CR22","doi-asserted-by":"crossref","first-page":"742","DOI":"10.1021\/ci100050t","volume":"50","author":"D Rogers","year":"2010","unstructured":"Rogers, D. & Hahn, M. Extended-connectivity fingerprints. J. Chem. Inf. Model. 50, 742\u2013754 (2010).","journal-title":"J. Chem. Inf. Model."},{"key":"994_CR23","doi-asserted-by":"crossref","first-page":"688\u2013702.e13","DOI":"10.1016\/j.cell.2020.01.021","volume":"180","author":"JM Stokes","year":"2020","unstructured":"Stokes, J. M. et al. A deep learning approach to antibiotic discovery. Cell 180, 688\u2013702.e13 (2020).","journal-title":"Cell"},{"key":"994_CR24","doi-asserted-by":"crossref","first-page":"177","DOI":"10.1038\/s41586-023-06887-8","volume":"626","author":"F Wong","year":"2024","unstructured":"Wong, F. et al. Discovery of a structural class of antibiotics with explainable deep learning. Nature 626, 177\u2013185 (2024).","journal-title":"Nature"},{"key":"994_CR25","unstructured":"Zhang, D. et al. Chemllm: a chemical large language model. Preprint at https:\/\/arxiv.org\/abs\/2402.06852 (2024)."},{"key":"994_CR26","unstructured":"Zhao, Z. et al. ChemDFM: a large language foundation model for chemistry. In 38th Conference on Neural Information Processing Systems, Foundation Models for Science: Progress, Opportunities, and Challenges (NeurIPS, 2024)."},{"key":"994_CR27","unstructured":"Cai, Z. et al. Internlm2 technical report. Preprint at https:\/\/arxiv.org\/abs\/2403.17297 (2024)."},{"key":"994_CR28","unstructured":"Touvron, H. et al. Llama: open and efficient foundation language models. Preprint at https:\/\/arxiv.org\/abs\/2302.13971 (2023)."},{"key":"994_CR29","doi-asserted-by":"publisher","unstructured":"Haque, M. & Li, S. Exploring ChatGPT and its impact on society. AI Ethics https:\/\/doi.org\/10.1007\/s43681-024-00435-4 (2024).","DOI":"10.1007\/s43681-024-00435-4"},{"key":"994_CR30","unstructured":"Wei, J. et al. Emergent abilities of large language models. Transact. Mach. Learn. Res. https:\/\/openreview.net\/pdf?id=yzkSU5zdwD (2022)."},{"key":"994_CR31","unstructured":"McKnight, P. E. & Najab, J. in The Corsini Encyclopedia of Psychology (eds Weiner, I. B. & Craighead, W. E.) (Wiley, 2010)."},{"key":"994_CR32","doi-asserted-by":"crossref","first-page":"1936","DOI":"10.1021\/acs.jcim.6b00290","volume":"56","author":"G Subramanian","year":"2016","unstructured":"Subramanian, G., Ramsundar, B., Pande, V. & Denny, R. A. Computational modeling of \u03b2-secretase 1 (BACE-1) inhibitors using ligand based approaches. J. Chem. Inf. Model. 56, 1936\u20131949 (2016).","journal-title":"J. Chem. Inf. Model."},{"key":"994_CR33","doi-asserted-by":"crossref","first-page":"420","DOI":"10.1021\/cn100007x","volume":"1","author":"T Wager","year":"2010","unstructured":"Wager, T. Defining desirable central nervous system drug space through the alignment of molecular properties, in vitro adme, and safety attributes. ACS Chem. Neurosci. 1, 420\u2013434 (2010).","journal-title":"ACS Chem. Neurosci."},{"key":"994_CR34","doi-asserted-by":"crossref","first-page":"435","DOI":"10.1021\/cn100008c","volume":"1","author":"T Wager","year":"2010","unstructured":"Wager, T., Hou, X., Verhoest, P. & Villalobos, A. Moving beyond rules: the development of a central nervous system multiparameter optimization (cns mpo) approach to enable alignment of druglike properties. ACS Chem. Neurosci. 1, 435\u2013449 (2010).","journal-title":"ACS Chem. Neurosci."},{"key":"994_CR35","doi-asserted-by":"crossref","first-page":"961","DOI":"10.4155\/tde.15.32","volume":"6","author":"W Geldenhuys","year":"2015","unstructured":"Geldenhuys, W., Mohammad, A., Adkins, C. & Lockman, P. Molecular determinants of blood\u2013brain barrier permeation. Ther. Deliv. 6, 961\u2013971 (2015).","journal-title":"Ther. Deliv."},{"key":"994_CR36","doi-asserted-by":"crossref","first-page":"157","DOI":"10.1162\/tacl_a_00638","volume":"12","author":"NF Liu","year":"2024","unstructured":"Liu, N. F. et al. Lost in the middle: how language models use long contexts. Trans. Assoc. Comput. Linguist. 12, 157\u2013173 (2024).","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"994_CR37","doi-asserted-by":"crossref","unstructured":"Qin, G., Feng, Y. & Van Durme, B. The NLP task effectiveness of long-range transformers. In Proc. 17th Conference of the European Chapter of the Association for Computational Linguistics (eds Vlachos, A. & Augenstein, I.) 3774\u20133790 (ACL, 2023).","DOI":"10.18653\/v1\/2023.eacl-main.273"},{"key":"994_CR38","doi-asserted-by":"crossref","first-page":"D506","DOI":"10.1093\/nar\/gky1049","volume":"47","author":"The UniProt Consortium.","year":"2019","unstructured":"The UniProt Consortium. UniProt: a worldwide hub of protein knowledge. Nucleic Acids Res. 47, D506\u2013D515 (2019).","journal-title":"Nucleic Acids Res."},{"key":"994_CR39","doi-asserted-by":"crossref","first-page":"D36","DOI":"10.1093\/nar\/gks1195","volume":"41","author":"DA Benson","year":"2013","unstructured":"Benson, D. A. et al. GenBank. Nucleic Acids Res. 41, D36\u2013D42 (2013).","journal-title":"Nucleic Acids Res."},{"key":"994_CR40","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1023\/A:1010933404324","volume":"45","author":"L Breiman","year":"2001","unstructured":"Breiman, L. Random forests. Mach. Learn. 45, 5\u201332 (2001).","journal-title":"Mach. Learn."},{"key":"994_CR41","doi-asserted-by":"crossref","first-page":"578","DOI":"10.1016\/j.jmat.2023.08.007","volume":"10","author":"YJ Park","year":"2024","unstructured":"Park, Y. J. et al. Can chatgpt be used to generate scientific hypotheses? J. Materiomics 10, 578\u2013584 (2024).","journal-title":"J. Materiomics"},{"key":"994_CR42","unstructured":"Honda, S., Shi, S. & Ueda, H. R. Smiles transformer: pre-trained molecular fingerprint for low data drug discovery. Preprint at https:\/\/arxiv.org\/abs\/1911.04738 (2019)."},{"key":"994_CR43","doi-asserted-by":"publisher","unstructured":"zyzisastudyreallyhardguy & Ju, J. Code repository LLM4SD: release v.1.0. Zenodo https:\/\/doi.org\/10.5281\/zenodo.13986921 (2024).","DOI":"10.5281\/zenodo.13986921"},{"key":"994_CR44","doi-asserted-by":"crossref","unstructured":"Student. The probable error of a mean. Biometrika 6, 1\u201325 (1908).","DOI":"10.2307\/2331554"}],"container-title":["Nature Machine Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s42256-025-00994-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-025-00994-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-025-00994-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T23:33:37Z","timestamp":1742859217000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s42256-025-00994-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,25]]},"references-count":44,"journal-issue":{"issue":"3","published-online":{"date-parts":[[2025,3]]}},"alternative-id":["994"],"URL":"https:\/\/doi.org\/10.1038\/s42256-025-00994-z","relation":{},"ISSN":["2522-5839"],"issn-type":[{"value":"2522-5839","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,25]]},"assertion":[{"value":"24 October 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}