{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T22:18:32Z","timestamp":1782425912846,"version":"3.54.5"},"reference-count":20,"publisher":"Oxford University Press (OUP)","issue":"7","funder":[{"DOI":"10.13039\/100000060","name":"National Institute of Allergy and Infectious Diseases","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000060","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,4,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Motivation<\/jats:title>\n                  <jats:p>Protein solubility can be a decisive factor in both research and production efficiency, and in silico sequence-based predictors that can accurately estimate solubility outcomes are highly sought.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>In this study, we present a novel approach termed PRotein SolubIlity Predictor (PaRSnIP), which uses a gradient boosting machine algorithm as well as an approximation of sequence and structural features of the protein of interest. Based on an independent test set, PaRSnIP outperformed other state-of-the-art sequence-based methods by more than 9% in accuracy and 0.17 in Matthew\u2019s correlation coefficient, with an overall accuracy of 74% and Matthew\u2019s correlation coefficient of 0.48. Additionally, PaRSnIP provides importance scores for all features used in training. We observed higher fractions of exposed residues to associate positively with protein solubility and tripeptide stretches with multiple histidines to associate negatively with solubility. The improved prediction accuracy of PaRSnIP should enable it to predict protein solubility with greater reliability and to screen for sequence variants with enhanced manufacturability.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>PaRSnIP software is available for download under GitHub (https:\/\/github.com\/RedaRawi\/PaRSnIP).<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Supplementary information<\/jats:title>\n                  <jats:p>Supplementary data are available at Bioinformatics online.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btx662","type":"journal-article","created":{"date-parts":[[2017,10,17]],"date-time":"2017-10-17T19:11:49Z","timestamp":1508267509000},"page":"1092-1098","source":"Crossref","is-referenced-by-count":104,"title":["PaRSnIP: sequence-based protein solubility prediction using gradient boosting machine"],"prefix":"10.1093","volume":"34","author":[{"given":"Reda","family":"Rawi","sequence":"first","affiliation":[{"name":"Vaccine Research Center, National Institute of Allergy and Infectious Diseases, National Institutes of Health, Bethesda, MD, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Raghvendra","family":"Mall","sequence":"additional","affiliation":[{"name":"Qatar Computing Research Institute, Hamad Bin Khalifa University, Doha, Qatar"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Khalid","family":"Kunji","sequence":"additional","affiliation":[{"name":"Qatar Computing Research Institute, Hamad Bin Khalifa University, Doha, Qatar"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chen-Hsiang","family":"Shen","sequence":"additional","affiliation":[{"name":"Vaccine Research Center, National Institute of Allergy and Infectious Diseases, National Institutes of Health, Bethesda, MD, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Peter D","family":"Kwong","sequence":"additional","affiliation":[{"name":"Vaccine Research Center, National Institute of Allergy and Infectious Diseases, National Institutes of Health, Bethesda, MD, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gwo-Yu","family":"Chuang","sequence":"additional","affiliation":[{"name":"Vaccine Research Center, National Institute of Allergy and Infectious Diseases, National Institutes of Health, Bethesda, MD, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"286","published-online":{"date-parts":[[2017,10,23]]},"reference":[{"key":"2023012712495112100_btx662-B1","doi-asserted-by":"crossref","first-page":"237","DOI":"10.1016\/j.jmb.2011.12.005","article-title":"Sequence-based prediction of protein solubility","volume":"421","author":"Agostini","year":"2012","journal-title":"J. Mol. Biol"},{"key":"2023012712495112100_btx662-B2","doi-asserted-by":"crossref","first-page":"2884","DOI":"10.1093\/nar\/29.13.2884","article-title":"SPINE: an integrated tracking database and data mining approach for identifying feasible targets in high-throughput structural proteomics","volume":"29","author":"Bertone","year":"2001","journal-title":"Nucleic Acids Res"},{"key":"2023012712495112100_btx662-B3","doi-asserted-by":"crossref","first-page":"3333.","DOI":"10.1038\/srep03333","article-title":"Soluble expression of proteins correlates with a lack of positively-charged surface","volume":"3","author":"Chan","year":"2013","journal-title":"Sci. Rep"},{"key":"2023012712495112100_btx662-B4","doi-asserted-by":"crossref","first-page":"953","DOI":"10.1093\/bib\/bbt057","article-title":"Bioinformatics approaches for improved recombinant protein production in Escherichia coli: protein solubility prediction","volume":"15","author":"Chang","year":"2014","journal-title":"Brief. Bioinformatics"},{"key":"2023012712495112100_btx662-B5","doi-asserted-by":"crossref","first-page":"903","DOI":"10.1038\/82823","article-title":"Structural proteomics of an archaeon","volume":"7","author":"Christendat","year":"2000","journal-title":"Nat. Struct. Biol"},{"key":"2023012712495112100_btx662-B6","doi-asserted-by":"crossref","first-page":"273","DOI":"10.1007\/BF00994018","article-title":"Support-Vector Networks","volume":"20","author":"Cortes","year":"1995","journal-title":"Mach. Learn"},{"key":"2023012712495112100_btx662-B7","doi-asserted-by":"crossref","first-page":"382","DOI":"10.1002\/(SICI)1097-0290(19991120)65:4<382::AID-BIT2>3.0.CO;2-I","article-title":"New fusion protein systems designed to give soluble expression in Escherichia coli","volume":"65","author":"Davis","year":"1999","journal-title":"Biotechnol. Bioeng"},{"key":"2023012712495112100_btx662-B8","doi-asserted-by":"crossref","first-page":"1189","DOI":"10.1214\/aos\/1013203451","article-title":"Greedy function approximation: a gradient boosting machine","volume":"29","author":"Friedman","year":"2001","journal-title":"Ann. Stat"},{"key":"2023012712495112100_btx662-B9","doi-asserted-by":"crossref","first-page":"3150","DOI":"10.1093\/bioinformatics\/bts565","article-title":"CD-HIT: accelerated for clustering the next-generation sequencing data","volume":"28","author":"Fu","year":"2012","journal-title":"Bioinformatics (Oxford, England)"},{"key":"2023012712495112100_btx662-B10","doi-asserted-by":"crossref","first-page":"S3","DOI":"10.1186\/1471-2105-13-S17-S3","article-title":"Prediction and analysis of protein solubility using a novel scoring card method with dipeptide composition","volume":"13","author":"Huang","year":"2012","journal-title":"BMC Bioinformatics"},{"key":"2023012712495112100_btx662-B11","doi-asserted-by":"crossref","first-page":"582","DOI":"10.1110\/ps.041009005","article-title":"Understanding the relationship between the primary structure of proteins and its propensity to be soluble on overexpression in Escherichia coli","volume":"14","author":"Idicula-Thomas","year":"2005","journal-title":"Prot. Sci"},{"key":"2023012712495112100_btx662-B12","first-page":"1658","article-title":"Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences","volume":"22","author":"Li","year":"2006","journal-title":"Bioinformatics (Oxford, England)"},{"key":"2023012712495112100_btx662-B13","first-page":"1014.","article-title":"Newton-Raphson and EM algorithms for linear mixed-effects models for repeated-measures data","volume":"83","author":"Lindstrom","year":"1988","journal-title":"J. Am. Stat. Assoc"},{"key":"2023012712495112100_btx662-B14","first-page":"2592","article-title":"SSpro\/ACCpro 5: almost perfect prediction of protein secondary structure and relative solvent accessibility using profiles, machine learning and structural similarity","volume":"30","author":"Magnan","year":"2014","journal-title":"Bioinformatics (Oxford, England)"},{"key":"2023012712495112100_btx662-B15","doi-asserted-by":"crossref","first-page":"2200","DOI":"10.1093\/bioinformatics\/btp386","article-title":"SOLpro: accurate sequence-based prediction of protein solubility","volume":"25","author":"Magnan","year":"2009","journal-title":"Bioinformatics (Oxford, England)"},{"key":"2023012712495112100_btx662-B16","volume-title":"Nonlinear Estimation and Classification","author":"Schapire","year":"2003"},{"key":"2023012712495112100_btx662-B17","doi-asserted-by":"crossref","first-page":"2536","DOI":"10.1093\/bioinformatics\/btl623","article-title":"Protein solubility: sequence based prediction and experimental verification","volume":"23","author":"Smialowski","year":"2007","journal-title":"Bioinformatics"},{"key":"2023012712495112100_btx662-B18","doi-asserted-by":"crossref","first-page":"2192","DOI":"10.1111\/j.1742-4658.2012.08603.x","article-title":"PROSO II - a new method for protein solubility prediction","volume":"279","author":"Smialowski","year":"2012","journal-title":"FEBS J"},{"key":"2023012712495112100_btx662-B19","first-page":"443","article-title":"Predicting the solubility of recombinant proteins in Escherichia coli","volume":"9","author":"Wilkinson","year":"1991","journal-title":"Bio\/Technology (Nature Publishing Company)"},{"key":"2023012712495112100_btx662-B20","doi-asserted-by":"crossref","first-page":"217","DOI":"10.1023\/B:jsfg.0000031965.37625.0e","article-title":"His tag effect on solubility of human proteins produced in Escherichia coli: a comparison between four expression vectors","volume":"5","author":"Woestenenk","year":"2004","journal-title":"J. Struct. Funct. Genomics"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/34\/7\/1092\/48913462\/bioinformatics_34_7_1092.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/34\/7\/1092\/48913462\/bioinformatics_34_7_1092.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T13:44:04Z","timestamp":1674827044000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/34\/7\/1092\/4562500"}},"subtitle":[],"editor":[{"given":"John","family":"Hancock","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"editor"}]}],"short-title":[],"issued":{"date-parts":[[2017,10,23]]},"references-count":20,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2018,4,1]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btx662","relation":{},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"value":"1367-4803","type":"print"},{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2018,4,1]]},"published":{"date-parts":[[2017,10,23]]}}}