{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,27]],"date-time":"2026-06-27T03:01:06Z","timestamp":1782529266914,"version":"3.54.5"},"reference-count":55,"publisher":"IOP Publishing","issue":"1","license":[{"start":{"date-parts":[[2022,1,31]],"date-time":"2022-01-31T00:00:00Z","timestamp":1643587200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2022,1,31]],"date-time":"2022-01-31T00:00:00Z","timestamp":1643587200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/iopscience.iop.org\/info\/page\/text-and-data-mining"}],"content-domain":{"domain":["iopscience.iop.org"],"crossmark-restriction":false},"short-container-title":["Mach. Learn.: Sci. Technol."],"published-print":{"date-parts":[[2022,3,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:p>Transformer models coupled with a simplified molecular line entry system (SMILES) have recently proven to be a powerful combination for solving challenges in cheminformatics. These models, however, are often developed specifically for a single application and can be very resource-intensive to train. In this work we present the Chemformer model\u2014a Transformer-based model which can be quickly applied to both sequence-to-sequence and discriminative cheminformatics tasks. Additionally, we show that self-supervised pre-training can improve performance and significantly speed up convergence on downstream tasks. On direct synthesis and retrosynthesis prediction benchmark datasets we publish state-of-the-art results for top-1 accuracy. We also improve on existing approaches for a molecular optimisation task and show that Chemformer can optimise on multiple discriminative tasks simultaneously. Models, datasets and code will be made available after publication.<\/jats:p>","DOI":"10.1088\/2632-2153\/ac3ffb","type":"journal-article","created":{"date-parts":[[2021,12,7]],"date-time":"2021-12-07T09:14:15Z","timestamp":1638868455000},"page":"015022","update-policy":"https:\/\/doi.org\/10.1088\/crossmark-policy","source":"Crossref","is-referenced-by-count":259,"title":["Chemformer: a pre-trained transformer for computational chemistry"],"prefix":"10.1088","volume":"3","author":[{"given":"Ross","family":"Irwin","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Spyridon","family":"Dimitriadis","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiazhen","family":"He","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1614-7376","authenticated-orcid":true,"given":"Esben Jannik","family":"Bjerrum","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"266","published-online":{"date-parts":[[2022,1,31]]},"reference":[{"key":"mlstac3ffbbib1","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"mlstac3ffbbib2","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"key":"mlstac3ffbbib3","first-page":"pp 1724","article-title":"Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation","author":"Cho","year":"2014"},{"key":"mlstac3ffbbib4","doi-asserted-by":"publisher","first-page":"1572","DOI":"10.1021\/acscentsci.9b00576","article-title":"Molecular transformer: a model for uncertainty-calibrated chemical reaction prediction","volume":"5","author":"Schwaller","year":"2019","journal-title":"ACS Cent. Sci."},{"key":"mlstac3ffbbib5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41467-020-19266-y","article-title":"State-of-the-art augmented NLP transformer models for direct and single-step retrosynthesis","volume":"11","author":"Tetko","year":"2020","journal-title":"Nat. Commun."},{"key":"mlstac3ffbbib6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13321-021-00497-0","article-title":"Molecular optimization by capturing chemist\u2019s intuition using deep neural networks","volume":"13","author":"He","year":"2021","journal-title":"J. Cheminform."},{"key":"mlstac3ffbbib7","doi-asserted-by":"crossref","DOI":"10.26434\/chemrxiv.14416133.v1","article-title":"Transformer neural network for structure constrained molecular optimization","author":"He","year":"2021"},{"key":"mlstac3ffbbib8","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1021\/ci00057a005","article-title":"Smiles, a chemical language and information system. 1. Introduction to methodology and encoding rules","volume":"28","author":"Weininger","year":"1988","journal-title":"J. Chem. Inf. Comput. Sci."},{"key":"mlstac3ffbbib9","article-title":"Molecular representation learning with language models and domain-relevant auxiliary tasks","author":"Fabian","year":"2020"},{"key":"mlstac3ffbbib10","article-title":"ChemBERTa: large-scale self-supervised pretraining for molecular property prediction","author":"Chithrananda","year":"2020"},{"key":"mlstac3ffbbib11","doi-asserted-by":"crossref","DOI":"10.1101\/2020.12.23.424259","article-title":"X-MOL: large-scale pre-training for molecular understanding and diverse molecular analysis","author":"Xue","year":"2020"},{"key":"mlstac3ffbbib12","first-page":"pp 429","article-title":"SMILES-BERT: large scale unsupervised pre-training for molecular property prediction","author":"Wang","year":"2019"},{"key":"mlstac3ffbbib13","doi-asserted-by":"publisher","first-page":"bbab152","DOI":"10.1093\/bib\/bbab152","article-title":"MG-BERT: leveraging unsupervised atomic representation learning for molecular property prediction","volume":"22","author":"Zhang","year":"2021","journal-title":"Brief. Bioinform."},{"key":"mlstac3ffbbib14","article-title":"Molecule attention transformer","author":"Maziarka","year":"2020"},{"key":"mlstac3ffbbib15","article-title":"Do large scale molecular language representations capture important structural information?","author":"Ross","year":"2021"},{"key":"mlstac3ffbbib16","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"mlstac3ffbbib17","first-page":"pp 7871","article-title":"BART: denoising sequence-to-sequence pre-training for natural language generation, translation and comprehension","author":"Lewis","year":"2020"},{"key":"mlstac3ffbbib18","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"mlstac3ffbbib19","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019"},{"key":"mlstac3ffbbib20","article-title":"Unified language model pre-training for natural language understanding and generation","author":"Dong","year":"2019"},{"key":"mlstac3ffbbib21","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"mlstac3ffbbib22","doi-asserted-by":"publisher","first-page":"2357","DOI":"10.3390\/molecules25102357","article-title":"Transfer learning: making retrosynthetic predictions based on a small chemical reaction dataset scale to a new level","volume":"25","author":"Bai","year":"2020","journal-title":"Molecules"},{"key":"mlstac3ffbbib23","article-title":"Data transfer approaches to improve seq-to-seq retrosynthesis","author":"Ishiguro","year":"2020"},{"key":"mlstac3ffbbib24","doi-asserted-by":"publisher","first-page":"9368","DOI":"10.1039\/D0CC02657C","article-title":"Heck reaction prediction using a transformer model based on a transfer learning strategy","volume":"56","author":"Wang","year":"2020","journal-title":"Chem. Commun."},{"key":"mlstac3ffbbib25","doi-asserted-by":"publisher","first-page":"8648","DOI":"10.1039\/D1SC02362D","article-title":"Predicting enzymatic reactions with a molecular transformer","volume":"12","author":"Kreutter","year":"2021","journal-title":"Chem. Sci."},{"key":"mlstac3ffbbib26","doi-asserted-by":"publisher","first-page":"1415","DOI":"10.1039\/D0QO01636E","article-title":"Data augmentation and transfer learning strategies for reaction prediction in low chemical data regimes","volume":"8","author":"Zhang","year":"2021","journal-title":"Org. Chem. Front."},{"key":"mlstac3ffbbib27","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41467-020-18671-7","article-title":"Transfer learning enables the molecular transformer to predict regio- and stereoselective reactions on carbohydrates","volume":"11","author":"Pesciullesi","year":"2020","journal-title":"Nat. Commun."},{"key":"mlstac3ffbbib28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13321-020-00430-x","article-title":"Inductive transfer learning for molecular activity prediction: next-gen QSAR models with MolPMoFiT","volume":"12","author":"Li","year":"2020","journal-title":"J. Cheminform."},{"key":"mlstac3ffbbib29","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13321-020-00423-w","article-title":"Transformer-CNN: Swiss knife for QSAR modeling and interpretation","volume":"12","author":"Karpov","year":"2020","journal-title":"J. Cheminform."},{"key":"mlstac3ffbbib30","doi-asserted-by":"publisher","first-page":"2324","DOI":"10.1021\/acs.jcim.5b00559","article-title":"Zinc 15\u2013ligand discovery for everyone","volume":"55","author":"Sterling","year":"2015","journal-title":"J. Chem. Inf. Model."},{"key":"mlstac3ffbbib31","doi-asserted-by":"publisher","first-page":"131","DOI":"10.3390\/biom8040131","article-title":"Improving chemical autoencoder latent space and molecular de novo generation diversity with heteroencoders","volume":"8","author":"Bjerrum","year":"2018","journal-title":"Biomolecules"},{"key":"mlstac3ffbbib32","article-title":"SMILES enumeration as data augmentation for neural network modeling of molecules","author":"Bjerrum","year":"2017"},{"key":"mlstac3ffbbib33","first-page":"pp 2604","article-title":"Predicting organic reaction outcomes with Weisfeiler\u2013Lehman network","author":"Jin","year":"2017"},{"key":"mlstac3ffbbib34","doi-asserted-by":"publisher","first-page":"4385","DOI":"10.1021\/acs.jmedchem.6b00153","article-title":"Big data from pharmaceutical patents: a computational analysis of medicinal chemists\u2019 bread and butter","volume":"59","author":"Schneider","year":"2016","journal-title":"J. Med. Chem."},{"key":"mlstac3ffbbib35","doi-asserted-by":"publisher","first-page":"2336","DOI":"10.1021\/acs.jcim.6b00564","article-title":"What\u2019s what: the (nearly) definitive guide to reaction role assignment","volume":"56","author":"Schneider","year":"2016","journal-title":"J. Chem. Inf. Model."},{"key":"mlstac3ffbbib36","doi-asserted-by":"publisher","first-page":"D930","DOI":"10.1093\/nar\/gky1075","article-title":"ChEMBL: towards direct deposition of bioassay data","volume":"47","author":"Mendez","year":"2019","journal-title":"Nucleic Acids Res."},{"key":"mlstac3ffbbib37","doi-asserted-by":"publisher","first-page":"254","DOI":"10.1038\/s42256-020-0174-5","article-title":"Direct steering of de novo molecular generation with descriptor conditional recurrent neural networks","volume":"2","author":"Kotsias","year":"2020","journal-title":"Nat. Mach. Intell."},{"key":"mlstac3ffbbib38","article-title":"Levenshtein augmentation improves performance of SMILES based deep-learning synthesis prediction","author":"Sumner","year":"2020"},{"key":"mlstac3ffbbib39","article-title":"An overview of multi-task learning in deep neural networks","author":"Ruder","year":"2017"},{"key":"mlstac3ffbbib40","doi-asserted-by":"publisher","first-page":"513","DOI":"10.1039\/C7SC02664A","article-title":"MoleculeNet: a benchmark for molecular machine learning","volume":"9","author":"Wu","year":"2018","journal-title":"Chem. Sci."},{"key":"mlstac3ffbbib41","first-page":"2825","article-title":"Scikit-learn: machine learning in python","volume":"12","author":"Pedregosa","year":"2011","journal-title":"J. Mach. Learn. Res."},{"key":"mlstac3ffbbib42","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13321-020-00428-5","article-title":"Industry-scale application and evaluation of deep learning for drug target prediction","volume":"12","author":"Sturm","year":"2020","journal-title":"J. Cheminform."},{"key":"mlstac3ffbbib43","first-page":"pp 8024","article-title":"PyTorch: an imperative style, high-performance deep learning library","volume":"vol 32","author":"Paszke","year":"2019"},{"key":"mlstac3ffbbib44","article-title":"PyTorch lightning","volume":"vol 3","author":"Falcon","year":"2019"},{"key":"mlstac3ffbbib45","article-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"mlstac3ffbbib46","article-title":"Gaussian error linear units (GELUs)","author":"Hendrycks","year":"2016"},{"key":"mlstac3ffbbib47","doi-asserted-by":"publisher","DOI":"10.1117\/12.2520589","article-title":"Super-convergence: very fast training of neural networks using large learning rates","volume":"11006","author":"Smith","year":"2019","journal-title":"Proc. SPIE"},{"key":"mlstac3ffbbib48","article-title":"Adam: a method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"mlstac3ffbbib49","doi-asserted-by":"crossref","DOI":"10.26434\/chemrxiv-2021-kzhbs","article-title":"PySMILESUtils\u2013enabling deep learning with the SMILES chemical language","author":"Bjerrum","year":"2021"},{"key":"mlstac3ffbbib50","doi-asserted-by":"publisher","first-page":"3370","DOI":"10.1021\/acs.jcim.9b00237","article-title":"Analyzing learned molecular representations for property prediction","volume":"59","author":"Yang","year":"2019","journal-title":"J. Chem. Inf. Model."},{"key":"mlstac3ffbbib51","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1021\/acs.jcim.9b00949","article-title":"Predicting retrosynthetic reactions using self-corrected transformer neural networks","volume":"60","author":"Zheng","year":"2019","journal-title":"J. Chem. Inf. Model."},{"key":"mlstac3ffbbib52","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1021\/acs.jcim.0c01074","article-title":"Valid, plausible and diverse retrosynthesis using tied two-way transformers with latent variables","volume":"61","author":"Kim","year":"2021","journal-title":"J. Chem. Inf. Model."},{"key":"mlstac3ffbbib53","article-title":"Molecule edit graph attention network: modeling chemical reactions as sequences of graph edits","author":"Sacha","year":"2020"},{"key":"mlstac3ffbbib54","article-title":"Retrosynthesis prediction with conditional graph logic network","author":"Dai","year":"2020"},{"key":"mlstac3ffbbib55","article-title":"Learning graph models for template-free retrosynthesis","author":"Somnath","year":"2020"}],"container-title":["Machine Learning: Science and Technology"],"original-title":[],"link":[{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb","content-type":"text\/html","content-version":"am","intended-application":"text-mining"},{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb\/pdf","content-type":"application\/pdf","content-version":"am","intended-application":"text-mining"},{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb\/pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb\/pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb\/pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb\/pdf","content-type":"application\/pdf","content-version":"am","intended-application":"similarity-checking"},{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb\/pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,17]],"date-time":"2022-05-17T02:23:33Z","timestamp":1652754213000},"score":1,"resource":{"primary":{"URL":"https:\/\/iopscience.iop.org\/article\/10.1088\/2632-2153\/ac3ffb"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,31]]},"references-count":55,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2022,1,31]]},"published-print":{"date-parts":[[2022,3,1]]}},"URL":"https:\/\/doi.org\/10.1088\/2632-2153\/ac3ffb","relation":{"has-preprint":[{"id-type":"doi","id":"10.26434\/chemrxiv-2021-v2pnn","asserted-by":"object"}]},"ISSN":["2632-2153"],"issn-type":[{"value":"2632-2153","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,1,31]]},"assertion":[{"value":"Chemformer: a pre-trained transformer for computational chemistry","name":"article_title","label":"Article Title"},{"value":"Machine Learning: Science and Technology","name":"journal_title","label":"Journal Title"},{"value":"paper","name":"article_type","label":"Article Type"},{"value":"\u00a9 2022 The Author(s). Published by IOP Publishing Ltd","name":"copyright_information","label":"Copyright Information"},{"value":"2021-09-06","name":"date_received","label":"Date Received","group":{"name":"publication_dates","label":"Publication dates"}},{"value":"2021-12-03","name":"date_accepted","label":"Date Accepted","group":{"name":"publication_dates","label":"Publication dates"}},{"value":"2022-01-31","name":"date_epub","label":"Online publication date","group":{"name":"publication_dates","label":"Publication dates"}}]}}