{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T22:18:36Z","timestamp":1782425916924,"version":"3.54.5"},"reference-count":34,"publisher":"Oxford University Press (OUP)","issue":"15","license":[{"start":{"date-parts":[[2018,3,15]],"date-time":"2018-03-15T00:00:00Z","timestamp":1521072000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/academic.oup.com\/journals\/pages\/about_us\/legal\/notices"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,8,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Motivation<\/jats:title>\n                  <jats:p>Protein solubility plays a vital role in pharmaceutical research and production yield. For a given protein, the extent of its solubility can represent the quality of its function, and is ultimately defined by its sequence. Thus, it is imperative to develop novel, highly accurate in silico sequence-based protein solubility predictors. In this work we propose, DeepSol, a novel Deep Learning-based protein solubility predictor. The backbone of our framework is a convolutional neural network that exploits k-mer structure and additional sequence and structural features extracted from the protein sequence.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>DeepSol outperformed all known sequence-based state-of-the-art solubility prediction methods and attained an accuracy of 0.77 and Matthew\u2019s correlation coefficient of 0.55. The superior prediction accuracy of DeepSol allows to screen for sequences with enhanced production capacity and can more reliably predict solubility of novel proteins.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>DeepSol\u2019s best performing models and results are publicly deposited at https:\/\/doi.org\/10.5281\/zenodo.1162886 (Khurana and Mall, 2018).<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Supplementary information<\/jats:title>\n                  <jats:p>Supplementary data are available at Bioinformatics online.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/bty166","type":"journal-article","created":{"date-parts":[[2018,3,15]],"date-time":"2018-03-15T10:12:46Z","timestamp":1521108766000},"page":"2605-2613","source":"Crossref","is-referenced-by-count":208,"title":["DeepSol: a deep learning framework for sequence-based protein solubility prediction"],"prefix":"10.1093","volume":"34","author":[{"given":"Sameer","family":"Khurana","sequence":"first","affiliation":[{"name":"Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Reda","family":"Rawi","sequence":"additional","affiliation":[{"name":"Vaccine Research Center, National Institute of Allergy and Infectious Diseases, National Institute of Health, Bethesda, MD, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Khalid","family":"Kunji","sequence":"additional","affiliation":[{"name":"Qatar Computing Research Institute, Hamad Bin Khalifa University, Doha, Qatar"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gwo-Yu","family":"Chuang","sequence":"additional","affiliation":[{"name":"Vaccine Research Center, National Institute of Allergy and Infectious Diseases, National Institute of Health, Bethesda, MD, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Halima","family":"Bensmail","sequence":"additional","affiliation":[{"name":"Qatar Computing Research Institute, Hamad Bin Khalifa University, Doha, Qatar"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Raghvendra","family":"Mall","sequence":"additional","affiliation":[{"name":"Qatar Computing Research Institute, Hamad Bin Khalifa University, Doha, Qatar"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"286","published-online":{"date-parts":[[2018,3,15]]},"reference":[{"key":"2023012713051156700_bty166-B1","doi-asserted-by":"crossref","first-page":"237","DOI":"10.1016\/j.jmb.2011.12.005","article-title":"Sequence-based prediction of protein solubility","volume":"421","author":"Agostini","year":"2012","journal-title":"J. Mol. Biol"},{"key":"2023012713051156700_bty166-B2","doi-asserted-by":"crossref","first-page":"e0141287.","DOI":"10.1371\/journal.pone.0141287","article-title":"Continuous distributed representation of biological sequences for deep proteomics and genomics","volume":"10","author":"Asgari","year":"2015","journal-title":"PloS One"},{"key":"2023012713051156700_bty166-B3","doi-asserted-by":"crossref","first-page":"2884","DOI":"10.1093\/nar\/29.13.2884","article-title":"SPINE: an integrated tracking database and data mining approach for identifying feasible targets in high-throughput structural proteomics","volume":"29","author":"Bertone","year":"2001","journal-title":"Nucleic Acids Res"},{"key":"2023012713051156700_bty166-B4","doi-asserted-by":"crossref","first-page":"S21.","DOI":"10.1186\/1471-2105-11-S1-S21","article-title":"Learning to predict expression efficacy of vectors in recombinant protein production","volume":"11","author":"Chan","year":"2010","journal-title":"BMC Bioinformatics"},{"key":"2023012713051156700_bty166-B5","doi-asserted-by":"crossref","first-page":"953","DOI":"10.1093\/bib\/bbt057","article-title":"Bioinformatics approaches for improved recombinant protein production in Escherichia coli: protein solubility prediction","volume":"15","author":"Chang","year":"2014","journal-title":"Brief. Bioinform"},{"key":"2023012713051156700_bty166-B6","doi-asserted-by":"crossref","first-page":"903","DOI":"10.1038\/82823","article-title":"Structural proteomics of an archaeon","volume":"7","author":"Christendat","year":"2000","journal-title":"Nat. Struct. Biol"},{"key":"2023012713051156700_bty166-B7","doi-asserted-by":"crossref","first-page":"273","DOI":"10.1007\/BF00994018","article-title":"Support vector networks","volume":"20","author":"Cortes","year":"1995","journal-title":"Mach. Learn"},{"key":"2023012713051156700_bty166-B8","doi-asserted-by":"crossref","first-page":"382","DOI":"10.1002\/(SICI)1097-0290(19991120)65:4<382::AID-BIT2>3.0.CO;2-I","article-title":"New fusion protein systems designed to give soluble expression in Escherichia coli","volume":"65","author":"Davis","year":"1999","journal-title":"Biotechnol. Bioeng"},{"key":"2023012713051156700_bty166-B9","doi-asserted-by":"crossref","first-page":"1189","DOI":"10.1214\/aos\/1013203451","article-title":"Greedy function approximation: a gradient boosting machine","volume":"29","author":"Friedman","year":"2001","journal-title":"Ann. Stat"},{"key":"2023012713051156700_bty166-B10","doi-asserted-by":"crossref","first-page":"3150","DOI":"10.1093\/bioinformatics\/bts565","article-title":"CD-HIT: accelerated for clustering the next-generation sequencing data","volume":"28","author":"Fu","year":"2012","journal-title":"Bioinformatics"},{"key":"2023012713051156700_bty166-B11","volume-title":"Digital Design and Computer Architecture.","author":"Harris","year":"2010"},{"key":"2023012713051156700_bty166-B12","author":"Hou","year":"2017"},{"key":"2023012713051156700_bty166-B13","doi-asserted-by":"crossref","first-page":"S3","DOI":"10.1186\/1471-2105-13-S17-S3","article-title":"Prediction and analysis of protein solubility using a novel scoring card method with dipeptide composition","volume":"13(Suppl 1)","author":"Huang","year":"2012","journal-title":"BMC Bioinformatics"},{"key":"2023012713051156700_bty166-B14","doi-asserted-by":"crossref","first-page":"582","DOI":"10.1110\/ps.041009005","article-title":"Understanding the relationship between the primary structure of proteins and its propensity to be soluble on overexpression in Escherichia coli","volume":"14","author":"Idicula-Thomas","year":"2005","journal-title":"Protein Sci"},{"key":"2023012713051156700_bty166-B15","author":"Khurana","year":"2018"},{"key":"2023012713051156700_bty166-B16","author":"Kingma","year":"2015"},{"key":"2023012713051156700_bty166-B17","first-page":"1995","article-title":"Convolutional networks for images, speech, and time series","volume":"3361","author":"LeCun","year":"1995","journal-title":"The Handbook of Brain Theory and Neural Networks"},{"key":"2023012713051156700_bty166-B18","doi-asserted-by":"crossref","first-page":"1658","DOI":"10.1093\/bioinformatics\/btl158","article-title":"CD-HIT: a fast program for clustering and comparing large sets of protein or nucleotide sequences","volume":"22","author":"Li","year":"2006","journal-title":"Bioinformatics"},{"key":"2023012713051156700_bty166-B19","author":"Li","year":"2016"},{"key":"2023012713051156700_bty166-B20","first-page":"2592","article-title":"SSpro\/ACCpro 5: almost perfect prediction of protein secondary structure and relative solvent accessibility using profiles, machine learning and structural similarity","volume":"30","author":"Magnan","year":"2014","journal-title":"Bioinformatics (Oxford, England)"},{"key":"2023012713051156700_bty166-B21","doi-asserted-by":"crossref","first-page":"2200","DOI":"10.1093\/bioinformatics\/btp386","article-title":"SOLpro: accurate sequence-based prediction of protein solubility","volume":"25","author":"Magnan","year":"2009","journal-title":"Bioinformatics"},{"key":"2023012713051156700_bty166-B22","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1214\/aoms\/1177730491","article-title":"On a test of whether one of two random variables is stochastically larger than the other","volume":"18","author":"Mann","year":"1947","journal-title":"Ann. Math. Stat"},{"key":"2023012713051156700_bty166-B23","first-page":"3111","article-title":"Distributed representations of words and phrases and their compositionality","author":"Mikolov","year":"2013","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2023012713051156700_bty166-B24","doi-asserted-by":"crossref","first-page":"927.","DOI":"10.1038\/nbt732","article-title":"Engineering soluble proteins for structural genomics","volume":"20","author":"P\u00e9delacq","year":"2002","journal-title":"Nat. Biotechnol"},{"key":"2023012713051156700_bty166-B25","article-title":"PaRSnIP: sequence-based protein solubility prediction using gradient boosting machine","author":"Rawi","year":"2017","journal-title":"Bioinformatics"},{"key":"2023012713051156700_bty166-B26","doi-asserted-by":"crossref","first-page":"2536","DOI":"10.1093\/bioinformatics\/btl623","article-title":"Protein solubility: sequence based prediction and experimental verification","volume":"23","author":"Smialowski","year":"2007","journal-title":"Bioinformatics"},{"key":"2023012713051156700_bty166-B27","doi-asserted-by":"crossref","first-page":"2192","DOI":"10.1111\/j.1742-4658.2012.08603.x","article-title":"PROSO II - a new method for protein solubility prediction","volume":"279","author":"Smialowski","year":"2012","journal-title":"FEBS J"},{"key":"2023012713051156700_bty166-B28","doi-asserted-by":"crossref","DOI":"10.1142\/5089","volume-title":"Least Squares Support Vector Machines","author":"Suykens","year":"2002"},{"key":"2023012713051156700_bty166-B29","doi-asserted-by":"crossref","first-page":"136","DOI":"10.1016\/j.sbi.2017.01.004","article-title":"Exploring the relationships between protein sequence, structure and solubility","volume":"42","author":"Trainor","year":"2017","journal-title":"Curr. Opin. Struct. Biol"},{"key":"2023012713051156700_bty166-B30","doi-asserted-by":"crossref","first-page":"e45869.","DOI":"10.1371\/journal.pone.0045869","article-title":"Exploring sequence characteristics related to high-level production of secreted proteins in aspergillus niger","volume":"7","author":"van den Berg","year":"2012","journal-title":"PLoS One"},{"key":"2023012713051156700_bty166-B31","article-title":"Protein secondary structure prediction using deep convolutional neural fields","volume":"6","author":"Wang","year":"2016","journal-title":"Sci. Rep"},{"key":"2023012713051156700_bty166-B32","doi-asserted-by":"crossref","first-page":"e1005324.","DOI":"10.1371\/journal.pcbi.1005324","article-title":"Accurate de novo prediction of protein contact map by ultra-deep learning model","volume":"13","author":"Wang","year":"2017","journal-title":"PLoS Comput. Biol"},{"key":"2023012713051156700_bty166-B33","first-page":"443","article-title":"Predicting the solubility of recombinant proteins in Escherichia coli","volume":"9","author":"Wilkinson","year":"1991","journal-title":"Biotechnology"},{"key":"2023012713051156700_bty166-B34","author":"Xu","year":"2015"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/34\/15\/2605\/48935396\/bioinformatics_34_15_2605.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/34\/15\/2605\/48935396\/bioinformatics_34_15_2605.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T14:07:17Z","timestamp":1674828437000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/34\/15\/2605\/4938490"}},"subtitle":[],"editor":[{"given":"Alfonso","family":"Valencia","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"editor"}]}],"short-title":[],"issued":{"date-parts":[[2018,3,15]]},"references-count":34,"journal-issue":{"issue":"15","published-print":{"date-parts":[[2018,8,1]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/bty166","relation":{},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"value":"1367-4803","type":"print"},{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2018,8,1]]},"published":{"date-parts":[[2018,3,15]]}}}