{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T14:35:16Z","timestamp":1777646116041,"version":"3.51.4"},"reference-count":53,"publisher":"Oxford University Press (OUP)","issue":"1","license":[{"start":{"date-parts":[[2024,1,4]],"date-time":"2024-01-04T00:00:00Z","timestamp":1704326400000},"content-version":"vor","delay-in-days":43,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Tsinghua University Initiative Scientific Research Program","award":["20221080025"],"award-info":[{"award-number":["20221080025"]}]},{"name":"Tsinghua-Peking University Center for Life Sciences","award":["61020100120"],"award-info":[{"award-number":["61020100120"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,11,22]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:p>Protein\u2013DNA interaction is critical for life activities such as replication, transcription and splicing. Identifying protein\u2013DNA binding residues is essential for modeling their interaction and downstream studies. However, developing accurate and efficient computational methods for this task remains challenging. Improvements in this area have the potential to drive novel applications in biotechnology and drug design. In this study, we propose a novel approach called Contrastive Learning And Pre-trained Encoder (CLAPE), which combines a pre-trained protein language model and the contrastive learning method to predict DNA binding residues. We trained the CLAPE-DB model on the protein\u2013DNA binding sites dataset and evaluated the model performance and generalization ability through various experiments. The results showed that the area under ROC curve values of the CLAPE-DB model on the two benchmark datasets reached 0.871 and 0.881, respectively, indicating superior performance compared to other existing models. CLAPE-DB showed better generalization ability and was specific to DNA-binding sites. In addition, we trained CLAPE on different protein\u2013ligand binding sites datasets, demonstrating that CLAPE is a general framework for binding sites prediction. To facilitate the scientific community, the benchmark datasets and codes are freely available at https:\/\/github.com\/YAndrewL\/clape.<\/jats:p>","DOI":"10.1093\/bib\/bbad488","type":"journal-article","created":{"date-parts":[[2024,1,4]],"date-time":"2024-01-04T01:20:31Z","timestamp":1704331231000},"source":"Crossref","is-referenced-by-count":72,"title":["Protein\u2013DNA binding sites prediction based on pre-trained protein language model and contrastive learning"],"prefix":"10.1093","volume":"25","author":[{"given":"Yufan","family":"Liu","sequence":"first","affiliation":[{"name":"MOE Key Laboratory of Bioinformatics , State Key Laboratory of Molecular Oncology, School of Pharmaceutical Sciences, , Beijing, 100084 , China"},{"name":"Tsinghua University , State Key Laboratory of Molecular Oncology, School of Pharmaceutical Sciences, , Beijing, 100084 , China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5830-0669","authenticated-orcid":false,"given":"Boxue","family":"Tian","sequence":"additional","affiliation":[{"name":"MOE Key Laboratory of Bioinformatics , State Key Laboratory of Molecular Oncology, School of Pharmaceutical Sciences, , Beijing, 100084 , China"},{"name":"Tsinghua University , State Key Laboratory of Molecular Oncology, School of Pharmaceutical Sciences, , Beijing, 100084 , China"}]}],"member":"286","published-online":{"date-parts":[[2024,1,3]]},"reference":[{"key":"2024011113572486400_ref1","doi-asserted-by":"crossref","first-page":"185","DOI":"10.1038\/nrmicro2261","article-title":"Bacterial nucleoid-associated proteins, nucleoid structure and gene expression","volume":"8","author":"Dillon","year":"2010","journal-title":"Nat Rev Microbiol"},{"key":"2024011113572486400_ref2","doi-asserted-by":"crossref","first-page":"650","DOI":"10.1016\/j.cell.2018.01.029","article-title":"The human transcription factors","volume":"172","author":"Lambert","year":"2018","journal-title":"Cell"},{"key":"2024011113572486400_ref3","doi-asserted-by":"crossref","first-page":"D408","DOI":"10.1093\/nar\/gkn749","article-title":"PEDANT covers all complete RefSeq genomes","volume":"37","author":"Walter","year":"2009","journal-title":"Nucleic Acids Res"},{"key":"2024011113572486400_ref4","doi-asserted-by":"crossref","first-page":"685","DOI":"10.1038\/nature05673","article-title":"Foxp3 controls regulatory T-cell function by interacting with AML1\/Runx1","volume":"446","author":"Ono","year":"2007","journal-title":"Nature"},{"key":"2024011113572486400_ref5","doi-asserted-by":"crossref","first-page":"861","DOI":"10.1016\/j.cell.2007.11.019","article-title":"Induction of pluripotent stem cells from adult human fibroblasts by defined factors","volume":"131","author":"Takahashi","year":"2007","journal-title":"Cell"},{"key":"2024011113572486400_ref6","doi-asserted-by":"crossref","first-page":"448","DOI":"10.1038\/nature13163","article-title":"REST and stress resistance in ageing and Alzheimer\u2019s disease","volume":"507","author":"Lu","year":"2014","journal-title":"Nature"},{"key":"2024011113572486400_ref7","doi-asserted-by":"crossref","first-page":"59","DOI":"10.1016\/j.neulet.2019.01.042","article-title":"Loss of nuclear REST\/NRSF in aged-dopaminergic neurons in Parkinson\u2019s disease patients","volume":"699","author":"Kawamura","year":"2019","journal-title":"Neurosci Lett"},{"key":"2024011113572486400_ref8","doi-asserted-by":"crossref","first-page":"4533","DOI":"10.1021\/acs.jmedchem.6b01761","article-title":"Drug discovery targeting bromodomain-containing protein 4","volume":"60","author":"Liu","year":"2017","journal-title":"J Med Chem"},{"key":"2024011113572486400_ref9","doi-asserted-by":"crossref","first-page":"95","DOI":"10.1111\/jnc.13625","article-title":"Physiological functions and pathobiology of TDP-43 and FUS\/TLS proteins","volume":"138","author":"Ratti","year":"2016","journal-title":"J Neurochem"},{"key":"2024011113572486400_ref10","doi-asserted-by":"crossref","first-page":"840","DOI":"10.1038\/nrg3306","article-title":"ChIP\u2013seq and beyond: new and improved methodologies to detect and characterize protein\u2013DNA interactions","volume":"13","author":"Furey","year":"2012","journal-title":"Nat Rev Genet"},{"key":"2024011113572486400_ref11","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/s13007-021-00780-z","article-title":"DNA\u2013protein interaction studies: a historical and comparative analysis","volume":"17","author":"Ferraz","year":"2021","journal-title":"Plant Methods"},{"key":"2024011113572486400_ref12","doi-asserted-by":"crossref","first-page":"3570","DOI":"10.1093\/nar\/28.18.3570","article-title":"PSI-BLAST searches using hidden Markov models of structural repeats: prediction of an unusual sliding DNA clamp and of \u03b2-propellers in UV-damaged DNA-binding protein","volume":"28","author":"Neuwald","year":"2000","journal-title":"Nucleic Acids Res"},{"key":"2024011113572486400_ref13","doi-asserted-by":"crossref","first-page":"173","DOI":"10.1038\/nmeth.1818","article-title":"HHblits: lightning-fast iterative protein sequence searching by HMM-HMM alignment","volume":"9","author":"Remmert","year":"2012","journal-title":"Nat Methods"},{"key":"2024011113572486400_ref14","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/1752-0509-4-S2-S1","article-title":"BindN+ for accurate prediction of DNA and RNA-binding residues from protein sequence features","volume":"4","author":"Wang","year":"2010","journal-title":"BMC Syst Biol"},{"key":"2024011113572486400_ref15","doi-asserted-by":"crossref","DOI":"10.1093\/bib\/bbac322","article-title":"A deep learning-based method for the prediction of DNA interacting residues in a protein","volume":"23","author":"Patiyal","year":"2022","journal-title":"Brief Bioinform"},{"key":"2024011113572486400_ref16","doi-asserted-by":"crossref","first-page":"D411","DOI":"10.1093\/nar\/gkq1105","article-title":"A series of PDB related databases for everyday needs","volume":"39","author":"Joosten","year":"2010","journal-title":"Nucleic Acids Res"},{"key":"2024011113572486400_ref17","doi-asserted-by":"crossref","first-page":"404","DOI":"10.1093\/bioinformatics\/16.4.404","article-title":"The PSIPRED protein structure prediction server","volume":"16","author":"McGuffin","year":"2000","journal-title":"Bioinformatics"},{"key":"2024011113572486400_ref18","doi-asserted-by":"crossref","first-page":"W243","DOI":"10.1093\/nar\/gkl298","article-title":"BindN: a web-based tool for efficient prediction of DNA and RNA binding sites in amino acid sequences","volume":"34","author":"Wang","year":"2006","journal-title":"Nucleic Acids Res"},{"key":"2024011113572486400_ref19","doi-asserted-by":"crossref","first-page":"5510","DOI":"10.3390\/ijms22115510","article-title":"DeepDISE: DNA binding site prediction using a deep learning method","volume":"22","author":"Hendrix","year":"2021","journal-title":"Int J Mol Sci"},{"key":"2024011113572486400_ref20","doi-asserted-by":"crossref","first-page":"124","DOI":"10.1109\/TCBB.2018.2858806","article-title":"EL_LSTM: prediction of DNA-binding residue from protein sequence by combining long short-term memory and ensemble learning","volume":"17","author":"Zhou","year":"2018","journal-title":"IEEE\/ACM Trans Comput Biol Bioinform"},{"key":"2024011113572486400_ref21","doi-asserted-by":"crossref","first-page":"2428","DOI":"10.1016\/j.jmb.2020.02.026","article-title":"ProNA2020 predicts protein\u2013DNA, protein\u2013RNA, and protein\u2013protein binding proteins and residues from sequence","volume":"432","author":"Qiu","year":"2020","journal-title":"J Mol Biol"},{"key":"2024011113572486400_ref22","doi-asserted-by":"crossref","first-page":"930","DOI":"10.1093\/bioinformatics\/bty756","article-title":"Improving the prediction of protein\u2013nucleic acids binding residues via multiple sequence profiles and the consensus of complementary methods","volume":"35","author":"Su","year":"2019","journal-title":"Bioinformatics"},{"key":"2024011113572486400_ref23","doi-asserted-by":"crossref","first-page":"e51","DOI":"10.1093\/nar\/gkab044","article-title":"GraphBind: protein structural context embedded rules learned by hierarchical graph neural networks for recognizing nucleic-acid-binding residues","volume":"49","author":"Xia","year":"2021","journal-title":"Nucleic Acids Res"},{"key":"2024011113572486400_ref24","doi-asserted-by":"crossref","first-page":"162","DOI":"10.1093\/bioinformatics\/btaa701","article-title":"Unsupervised protein embeddings outperform hand-crafted sequence and structure features at predicting molecular function","volume":"37","author":"Villegas-Morcillo","year":"2021","journal-title":"Bioinformatics"},{"key":"2024011113572486400_ref25","doi-asserted-by":"crossref","first-page":"225","DOI":"10.1016\/j.aiopen.2021.08.002","article-title":"Pre-trained models: past, present and future","volume":"2","author":"Han","year":"2021","journal-title":"AI Open"},{"issue":"2009","key":"2024011113572486400_ref26","article-title":"Improved the protein complex prediction with protein language models","volume":"2022","author":"Chen","year":"2015","journal-title":"bioRxiv"},{"issue":"2011","key":"2024011113572486400_ref27","article-title":"Adapting protein language models for rapid DTI prediction","volume":"2022","author":"Sledzieski","year":"2003","journal-title":"bioRxiv"},{"key":"2024011113572486400_ref28","doi-asserted-by":"crossref","DOI":"10.1093\/bioinformatics\/btad456","article-title":"CSI: Contrastive data Stratification for Interaction prediction and its application to compound\u2013protein interaction prediction","volume":"39","author":"Kalia","year":"2023","journal-title":"Bioinformatics"},{"key":"2024011113572486400_ref29","doi-asserted-by":"crossref","DOI":"10.1073\/pnas.2220778120","article-title":"Contrastive learning in protein language space predicts interactions between drugs and protein targets","volume":"120","author":"Singh","year":"2023","journal-title":"Proc Natl Acad Sci U S A"},{"key":"2024011113572486400_ref30","doi-asserted-by":"crossref","first-page":"7112","DOI":"10.1109\/TPAMI.2021.3095381","article-title":"Prottrans: toward understanding the language of life through self-supervised learning","volume":"44","author":"Elnaggar","year":"2021","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2024011113572486400_ref31"},{"key":"2024011113572486400_ref32","author":"Song"},{"key":"2024011113572486400_ref33","article-title":"Predicting protein-peptide binding residues via interpretable deep learning","author":"Wang","year":"2022","journal-title":"Bioinformatics"},{"key":"2024011113572486400_ref34"},{"key":"2024011113572486400_ref35","doi-asserted-by":"crossref","DOI":"10.1093\/bib\/bbab564","article-title":"AlphaFold2-aware protein-DNA binding site prediction using graph transformer","author":"Yuan","year":"2022","journal-title":"Brief Bioinform"},{"key":"2024011113572486400_ref36","author":"Yang"},{"issue":"2018","key":"2024011113572486400_ref37","volume":"1945\u20131954","author":"He","journal-title":"2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"2024011113572486400_ref38","doi-asserted-by":"crossref","first-page":"318","DOI":"10.1109\/TPAMI.2018.2858826","article-title":"Focal loss for dense object detection","volume":"42","author":"Lin","year":"2017","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2024011113572486400_ref39","author":"Cui"},{"key":"2024011113572486400_ref40","author":"Wen"},{"key":"2024011113572486400_ref41","doi-asserted-by":"crossref","first-page":"233","DOI":"10.1146\/annurev-biochem-060408-091030","article-title":"Origins of specificity in protein-DNA recognition","volume":"79","author":"Rohs","year":"2010","journal-title":"Annu Rev Biochem"},{"key":"2024011113572486400_ref42","doi-asserted-by":"crossref","first-page":"reviews001.1","DOI":"10.1186\/gb-2000-1-1-reviews001","article-title":"An overview of the structures of protein-DNA complexes","volume":"1","author":"Luscombe","year":"2000","journal-title":"Genome Biol"},{"key":"2024011113572486400_ref43","doi-asserted-by":"crossref","first-page":"109","DOI":"10.1002\/prot.21328","article-title":"Dynamic \u03b1-helices: conformations that do not conform","volume":"68","author":"Sandhu","year":"2007","journal-title":"Proteins"},{"key":"2024011113572486400_ref44","doi-asserted-by":"crossref","first-page":"bbac236","DOI":"10.1093\/bib\/bbac236","article-title":"iDRNA-ITF: identifying DNA- and RNA-binding residues in proteins based on induction and transfer framework","volume":"23","author":"Wang","year":"2022","journal-title":"Brief Bioinform"},{"key":"2024011113572486400_ref45","doi-asserted-by":"crossref","first-page":"bbac538","DOI":"10.1093\/bib\/bbac538","article-title":"Prediction of RNA-interacting residues in a protein using CNN and evolutionary profile","volume":"24","author":"Patiyal","year":"2023","journal-title":"Brief Bioinform"},{"key":"2024011113572486400_ref46","doi-asserted-by":"crossref","first-page":"321","DOI":"10.1613\/jair.953","article-title":"SMOTE: synthetic minority over-sampling technique","volume":"16","author":"Chawla","year":"2002","journal-title":"J Artif Intell Res"},{"key":"2024011113572486400_ref47"},{"key":"2024011113572486400_ref48","doi-asserted-by":"crossref","first-page":"1123","DOI":"10.1126\/science.ade2574","article-title":"Evolutionary-scale prediction of atomic-level protein structure with a language model","volume":"379","author":"Lin","year":"2023","journal-title":"Science"},{"key":"2024011113572486400_ref49","doi-asserted-by":"crossref","first-page":"503","DOI":"10.1038\/s42003-022-03445-2","article-title":"PepNN: a deep attention model for the identification of peptide binding sites","volume":"5","author":"Abdin","year":"2022","journal-title":"Commun Biol"},{"issue":"(13)","key":"2024011113572486400_ref50","doi-asserted-by":"crossref","first-page":"1658","DOI":"10.1093\/bioinformatics\/btl158","article-title":"Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences","volume":"22","author":"Li","year":"2006","journal-title":"Bioinformatics"},{"key":"2024011113572486400_ref51","doi-asserted-by":"crossref","first-page":"D1096","DOI":"10.1093\/nar\/gks966","article-title":"BioLiP: a semi-manually curated database for biologically relevant ligand\u2013protein interactions","volume":"41","author":"Yang","year":"2012","journal-title":"Nucleic Acids Research"},{"key":"2024011113572486400_ref52","doi-asserted-by":"crossref","first-page":"D1140","DOI":"10.1093\/nar\/gkt1043","article-title":"SAbDab: the structural antibody database","volume":"42","author":"Dunbar","year":"2013","journal-title":"Nucleic Acids Res"},{"key":"2024011113572486400_ref53","article-title":"HuggingFace\u2019s Transformers: State-of-the-art Natural Language Processing","author":"Wolf","year":"2019","journal-title":"ArXiv"}],"container-title":["Briefings in Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bib\/article-pdf\/25\/1\/bbad488\/55381199\/bbad488.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bib\/article-pdf\/25\/1\/bbad488\/55381199\/bbad488.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,11]],"date-time":"2024-01-11T14:59:06Z","timestamp":1704985146000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bib\/article\/doi\/10.1093\/bib\/bbad488\/7505238"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,22]]},"references-count":53,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,11,22]]}},"URL":"https:\/\/doi.org\/10.1093\/bib\/bbad488","relation":{},"ISSN":["1467-5463","1477-4054"],"issn-type":[{"value":"1467-5463","type":"print"},{"value":"1477-4054","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2024,1,1]]},"published":{"date-parts":[[2023,11,22]]},"article-number":"bbad488"}}