{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,15]],"date-time":"2026-02-15T02:55:48Z","timestamp":1771124148229,"version":"3.50.1"},"reference-count":52,"publisher":"Oxford University Press (OUP)","issue":"16","license":[{"start":{"date-parts":[[2017,4,7]],"date-time":"2017-04-07T00:00:00Z","timestamp":1491523200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/academic.oup.com\/journals\/pages\/about_us\/legal\/notices"}],"funder":[{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["R01 GM057089"],"award-info":[{"award-number":["R01 GM057089"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017,8,15]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Motivation<\/jats:title>\n                  <jats:p>The Human Protein Atlas (HPA) enables the simultaneous characterization of thousands of proteins across various tissues to pinpoint their spatial location in the human body. This has been achieved through transcriptomics and high-throughput immunohistochemistry-based approaches, where over 40\u2009000 unique human protein fragments have been expressed in E. coli. These datasets enable quantitative tracking of entire cellular proteomes and present new avenues for understanding molecular-level properties influencing expression and solubility.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>Combining computational biology and machine learning identifies protein properties that hinder the HPA high-throughput antibody production pipeline. We predict protein expression and solubility with accuracies of 70% and 80%, respectively, based on a subset of key properties (aromaticity, hydropathy and isoelectric point). We guide the selection of protein fragments based on these characteristics to optimize high-throughput experimentation.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>We present the machine learning workflow as a series of IPython notebooks hosted on GitHub (https:\/\/github.com\/SBRG\/Protein_ML). The workflow can be used as a template for analysis of further expression and solubility datasets.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Supplementary information<\/jats:title>\n                  <jats:p>Supplementary data are available at Bioinformatics online.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btx207","type":"journal-article","created":{"date-parts":[[2017,4,5]],"date-time":"2017-04-05T19:57:27Z","timestamp":1491422247000},"page":"2487-2495","source":"Crossref","is-referenced-by-count":12,"title":["Machine learning in computational biology to accelerate high-throughput protein expression"],"prefix":"10.1093","volume":"33","author":[{"given":"Anand","family":"Sastry","sequence":"first","affiliation":[{"name":"Department of Bioengineering, University of California, San Diego, CA, USA"}]},{"given":"Jonathan","family":"Monk","sequence":"additional","affiliation":[{"name":"Department of Bioengineering, University of California, San Diego, CA, USA"}]},{"given":"Hanna","family":"Tegel","sequence":"additional","affiliation":[{"name":"KTH - Royal Institute of Technology, Department of Proteomics and Nanobiotechnology, Stockholm, Sweden"}]},{"given":"Mathias","family":"Uhlen","sequence":"additional","affiliation":[{"name":"KTH - Royal Institute of Technology, Department of Proteomics and Nanobiotechnology, Stockholm, Sweden"},{"name":"The Novo Nordisk Foundation Center for Biosustainability, Technical University of Denmark, Lyngby, Denmark"}]},{"given":"Bernhard O","family":"Palsson","sequence":"additional","affiliation":[{"name":"Department of Bioengineering, University of California, San Diego, CA, USA"},{"name":"The Novo Nordisk Foundation Center for Biosustainability, Technical University of Denmark, Lyngby, Denmark"}]},{"given":"Johan","family":"Rockberg","sequence":"additional","affiliation":[{"name":"KTH - Royal Institute of Technology, Department of Proteomics and Nanobiotechnology, Stockholm, Sweden"}]},{"given":"Elizabeth","family":"Brunk","sequence":"additional","affiliation":[{"name":"Department of Bioengineering, University of California, San Diego, CA, USA"},{"name":"The Novo Nordisk Foundation Center for Biosustainability, Technical University of Denmark, Lyngby, Denmark"}]}],"member":"286","published-online":{"date-parts":[[2017,4,7]]},"reference":[{"key":"2023020206245384900_btx207-B1","doi-asserted-by":"crossref","first-page":"405","DOI":"10.1074\/mcp.M300022-MCP200","article-title":"Affinity proteomics for systematic protein profiling of chromosome 21 gene products in human tissues","volume":"2","author":"Agaton","year":"2003","journal-title":"Mol. Cell. Proteomics"},{"key":"2023020206245384900_btx207-B2","doi-asserted-by":"crossref","first-page":"878","DOI":"10.15252\/msb.20156651","article-title":"Deep learning for computational biology","volume":"12","author":"Angermueller","year":"2016","journal-title":"Mol. Syst. Biol"},{"key":"2023020206245384900_btx207-B3","doi-asserted-by":"crossref","first-page":"2087","DOI":"10.15252\/embj.201694699","article-title":"Codon identity regulates mRNA stability and translation efficiency during the maternal-to-zygotic transition","volume":"35","author":"Bazzini","year":"2016","journal-title":"EMBO J"},{"key":"2023020206245384900_btx207-B4","doi-asserted-by":"crossref","first-page":"333","DOI":"10.1038\/nrg3433","article-title":"Computational solutions for omics data","volume":"14","author":"Berger","year":"2013","journal-title":"Nat. Rev. Genet"},{"key":"2023020206245384900_btx207-B54","doi-asserted-by":"crossref","first-page":"2832","DOI":"10.1002\/pmic.200800203","article-title":"A whole-genome bioinformatics approach to selection of antigens for systematic antibody generation","volume":"8","author":"Berglund","year":"2008","journal-title":"Proteomics"},{"key":"2023020206245384900_btx207-B5","doi-asserted-by":"crossref","first-page":"358","DOI":"10.1038\/nature16509","article-title":"Codon influence on protein expression in e. coli correlates with mRNA levels","volume":"529","author":"Bo\u00ebl","year":"2016","journal-title":"Nature"},{"key":"2023020206245384900_btx207-B6","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1016\/0022-2836(68)90076-4","article-title":"RNA codons and protein synthesis. 15. dissimilar responses of mammalian and bacterial transfer RNA fractions to messenger RNA codons","volume":"37","author":"Caskey","year":"1968","journal-title":"J. Mol. Biol"},{"key":"2023020206245384900_btx207-B7","doi-asserted-by":"crossref","first-page":"D93","DOI":"10.1093\/nar\/gkn787","article-title":"GtRNAdb: a database of transfer RNA genes detected in genomic sequence","volume":"37","author":"Chan","year":"2009","journal-title":"Nucleic Acids Res"},{"key":"2023020206245384900_btx207-B8","doi-asserted-by":"crossref","first-page":"2641","DOI":"10.1101\/gad.8.21.2641","article-title":"Role of the AGA\/AGG codons, the rarest codons in global gene expression in Escherichia coli","volume":"8","author":"Chen","year":"1994","journal-title":"Genes Dev"},{"key":"2023020206245384900_btx207-B9","doi-asserted-by":"crossref","first-page":"W72","DOI":"10.1093\/nar\/gki396","article-title":"SCRATCH: a protein structure and structural feature prediction server","volume":"33","author":"Cheng","year":"2005","journal-title":"Nucleic Acids Res"},{"key":"2023020206245384900_btx207-B10","doi-asserted-by":"crossref","first-page":"1422","DOI":"10.1093\/bioinformatics\/btp163","article-title":"Biopython: freely available python tools for computational molecular biology and bioinformatics","volume":"25","author":"Cock","year":"2009","journal-title":"Bioinformatics"},{"key":"2023020206245384900_btx207-B11","doi-asserted-by":"crossref","first-page":"2718","DOI":"10.1128\/jb.178.9.2718-2720.1996","article-title":"Synonymous codon selection controls in vivo turnover and amount of mRNA in Escherichia coli bla and ompa genes","volume":"178","author":"Deana","year":"1996","journal-title":"J. Bacteriol"},{"key":"2023020206245384900_btx207-B12","doi-asserted-by":"crossref","first-page":"374","DOI":"10.1002\/bit.22537","article-title":"Prediction of protein solubility in Escherichia coli using logistic regression","volume":"105","author":"Diaz","year":"2010","journal-title":"Biotechnol. Bioeng"},{"key":"2023020206245384900_btx207-B13","doi-asserted-by":"crossref","first-page":"649","DOI":"10.1006\/jmbi.1996.0428","article-title":"Co-variation of tRNA abundance and codon usage in Escherichia coli at different growth rates","volume":"260","author":"Dong","year":"1996","journal-title":"J. Mol. Biol"},{"key":"2023020206245384900_btx207-B14","doi-asserted-by":"crossref","first-page":"5036","DOI":"10.1093\/nar\/gkh834","article-title":"Solving the riddle of codon usage preferences: a test for translational selection","volume":"32","author":"dos Reis","year":"2004","journal-title":"Nucleic Acids Res"},{"key":"2023020206245384900_btx207-B15","doi-asserted-by":"crossref","first-page":"13091","DOI":"10.1038\/ncomms13091","article-title":"Multi-omic data integration enables discovery of hidden biological regularities","volume":"7","author":"Ebrahim","year":"2016","journal-title":"Nat. Commun"},{"key":"2023020206245384900_btx207-B16","doi-asserted-by":"crossref","first-page":"3784","DOI":"10.1093\/nar\/gkg563","article-title":"ExPASy: the proteomics server for in-depth protein knowledge and analysis","volume":"31","author":"Gasteiger","year":"2003","journal-title":"Nucleic Acids Res"},{"key":"2023020206245384900_btx207-B17","doi-asserted-by":"crossref","first-page":"115","DOI":"10.1016\/j.jmb.2003.11.053","article-title":"Mining the structural genomics pipeline: identification of protein properties that affect high-throughput experimental analysis","volume":"336","author":"Goh","year":"2004","journal-title":"J. Mol. Biol"},{"key":"2023020206245384900_btx207-B18","doi-asserted-by":"crossref","first-page":"475","DOI":"10.1126\/science.1241934","article-title":"Causes and effects of n-terminal codon bias in bacterial genes","volume":"342","author":"Goodman","year":"2013","journal-title":"Science"},{"key":"2023020206245384900_btx207-B19","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/1471-2105-15-134","article-title":"A review of machine learning methods to predict the solubility of overexpressed recombinant proteins in Escherichia coli","volume":"15","author":"Habibi","year":"2014","journal-title":"BMC Bioinformatics"},{"key":"2023020206245384900_btx207-B21","doi-asserted-by":"crossref","first-page":"1444","DOI":"10.1002\/pmic.201200175","article-title":"ESPRESSO: a system for estimating protein expression and solubility in protein expression systems","volume":"13","author":"Hirose","year":"2013","journal-title":"Proteomics"},{"key":"2023020206245384900_btx207-B22","doi-asserted-by":"crossref","first-page":"278","DOI":"10.1093\/bioinformatics\/bti810","article-title":"A support vector machine-based method for predicting the propensity of a protein to be soluble or to form inclusion body on overexpression in Escherichia coli","volume":"22","author":"Idicula-Thomas","year":"2006","journal-title":"Bioinformatics"},{"key":"2023020206245384900_btx207-B23","doi-asserted-by":"crossref","first-page":"389","DOI":"10.1016\/0022-2836(81)90003-6","article-title":"Correlation between the abundance of Escherichia coli transfer RNAs and the occurrence of the respective codons in its protein genes: a proposal for a synonymous codon choice that is optimal for the E. coli translational system","volume":"151","author":"Ikemura","year":"1981","journal-title":"J. Mol. Biol"},{"key":"2023020206245384900_btx207-B24","doi-asserted-by":"crossref","first-page":"857","DOI":"10.1093\/bioinformatics\/btu744","article-title":"DISOPRED3: precise disordered region predictions with annotated protein-binding activity","volume":"31","author":"Jones","year":"2015","journal-title":"Bioinformatics"},{"key":"2023020206245384900_btx207-B25","doi-asserted-by":"crossref","first-page":"13","DOI":"10.1016\/j.gene.2005.06.037","article-title":"Regulation of translation via mRNA structure in prokaryotes and eukaryotes","volume":"361","author":"Kozak","year":"2005","journal-title":"Gene"},{"key":"2023020206245384900_btx207-B26","doi-asserted-by":"crossref","first-page":"255","DOI":"10.1126\/science.1170160","article-title":"Coding-sequence determinants of gene expression in Escherichia coli","volume":"324","author":"Kudla","year":"2009","journal-title":"Science"},{"key":"2023020206245384900_btx207-B27","doi-asserted-by":"crossref","first-page":"406","DOI":"10.1007\/978-3-540-77046-6_50","volume-title":"Pattern Recognition and Machine Intelligence","author":"Kumar","year":"2007"},{"key":"2023020206245384900_btx207-B28","doi-asserted-by":"crossref","first-page":"538","DOI":"10.1038\/nature10965","article-title":"The anti-Shine-Dalgarno sequence drives translational pausing and codon choice in bacteria","volume":"484","author":"Li","year":"2012","journal-title":"Nature"},{"key":"2023020206245384900_btx207-B29","doi-asserted-by":"crossref","first-page":"624","DOI":"10.1016\/j.cell.2014.02.033","article-title":"Quantifying absolute protein synthesis rates reveals principles underlying allocation of cellular resources","volume":"157","author":"Li","year":"2014","journal-title":"Cell"},{"key":"2023020206245384900_btx207-B30","doi-asserted-by":"crossref","first-page":"1453","DOI":"10.1016\/j.str.2003.10.002","article-title":"Protein disorder prediction: implications for structural proteomics","volume":"11","author":"Linding","year":"2003","journal-title":"Structure"},{"key":"2023020206245384900_btx207-B32","doi-asserted-by":"crossref","first-page":"e49","DOI":"10.1093\/nar\/gkv036","article-title":"Solid-phase cloning for high-throughput assembly of single and multiple DNA parts","volume":"43","author":"Lundqvist","year":"2015","journal-title":"Nucleic Acids Res"},{"key":"2023020206245384900_btx207-B33","doi-asserted-by":"crossref","first-page":"2200","DOI":"10.1093\/bioinformatics\/btp386","article-title":"SOLpro: accurate sequence-based prediction of protein solubility","volume":"25","author":"Magnan","year":"2009","journal-title":"Bioinformatics"},{"key":"2023020206245384900_btx207-B34","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1007\/978-1-60327-429-6_1","article-title":"UNAFold: software for nucleic acid folding and hybridization","volume":"453","author":"Markham","year":"2008","journal-title":"Methods Mol. Biol"},{"key":"2023020206245384900_btx207-B35","doi-asserted-by":"crossref","first-page":"144","DOI":"10.1016\/j.molbiopara.2006.03.011","article-title":"Heterologous expression of proteins from Plasmodium falciparum: results from 1000 genes","volume":"148","author":"Mehlin","year":"2006","journal-title":"Mol. Biochem. Parasitol"},{"key":"2023020206245384900_btx207-B36","doi-asserted-by":"crossref","first-page":"686","DOI":"10.1016\/j.celrep.2015.12.073","article-title":"Clarifying the translational pausing landscape in bacteria by ribosome profiling","volume":"14","author":"Mohammad","year":"2016","journal-title":"Cell Rep"},{"key":"2023020206245384900_btx207-B37","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1016\/j.sbi.2013.12.006","article-title":"Assessing the accuracy of physical models used in protein-folding simulations: quantitative evidence from long molecular dynamics simulations","volume":"24","author":"Piana","year":"2014","journal-title":"Curr. Opin. Struct. Biol"},{"key":"2023020206245384900_btx207-B38","doi-asserted-by":"crossref","first-page":"588","DOI":"10.1002\/bies.201400187","article-title":"Decoding the jargon of bottom-up metabolic systems biology","volume":"37","author":"Rolfsson","year":"2015","journal-title":"Bioessays"},{"key":"2023020206245384900_btx207-B39","doi-asserted-by":"crossref","first-page":"172","DOI":"10.3389\/fmicb.2014.00172","article-title":"Recombinant protein expression in Escherichia coli: advances and challenges","volume":"5","author":"Rosano","year":"2014","journal-title":"Front. Microbiol"},{"key":"2023020206245384900_btx207-B40","doi-asserted-by":"crossref","first-page":"3975","DOI":"10.1021\/bi00411a013","article-title":"Influence of duplexes 3\u2032 to the mRNA initiation codon on the efficiency of monosome formation","volume":"27","author":"Shakin-Eshleman","year":"1988","journal-title":"Biochemistry"},{"key":"2023020206245384900_btx207-B41","doi-asserted-by":"crossref","first-page":"1281","DOI":"10.1093\/nar\/15.3.1281","article-title":"The codon adaptation index: a measure of directional synonymous codon usage bias, and its potential applications","volume":"15","author":"Sharp","year":"1987","journal-title":"Nucleic Acids Res"},{"key":"2023020206245384900_btx207-B42","doi-asserted-by":"crossref","first-page":"2536","DOI":"10.1093\/bioinformatics\/btl623","article-title":"Protein solubility: sequence based prediction and experimental verification","volume":"23","author":"Smialowski","year":"2007","journal-title":"Bioinformatics"},{"key":"2023020206245384900_btx207-B43","doi-asserted-by":"crossref","first-page":"2192","DOI":"10.1111\/j.1742-4658.2012.08603.x","article-title":"PROSO II: a new method for protein solubility prediction","volume":"279","author":"Smialowski","year":"2012","journal-title":"FEBS J"},{"key":"2023020206245384900_btx207-B44","doi-asserted-by":"crossref","first-page":"328","DOI":"10.1016\/j.jmb.2012.06.010","article-title":"Silent substitutions predictably alter translation elongation rates and protein folding efficiencies","volume":"422","author":"Spencer","year":"2012","journal-title":"J. Mol. Biol"},{"key":"2023020206245384900_btx207-B52","doi-asserted-by":"crossref","first-page":"6","DOI":"10.1016\/j.bbapap.2005.07.002","article-title":"High-throughput solubility assay for purified recombinant protein immunogens","volume":"1752","author":"Stenvall","year":"2005","journal-title":"Biochim. Biophys"},{"key":"2023020206245384900_btx207-B53","doi-asserted-by":"crossref","first-page":"51","DOI":"10.1002\/biot.200800183","article-title":"High-throughput protein production--lessons from scaling up from 10 to 288 recombinant proteins per week","volume":"4","author":"Tegel","year":"2009","journal-title":"Biotechnol J"},{"key":"2023020206245384900_btx207-B45","doi-asserted-by":"crossref","first-page":"3645","DOI":"10.1073\/pnas.0909910107","article-title":"Translation efficiency is determined by both codon bias and folding energy","volume":"107","author":"Tuller","year":"2010","journal-title":"Proc. Natl. Acad. Sci. U. S. A"},{"key":"2023020206245384900_btx207-B46","doi-asserted-by":"crossref","first-page":"1248","DOI":"10.1038\/nbt1210-1248","article-title":"Towards a knowledge-based human protein atlas","volume":"28","author":"Uhl\u00e9n","year":"2010","journal-title":"Nat. Biotechnol"},{"key":"2023020206245384900_btx207-B47","doi-asserted-by":"crossref","first-page":"1260419.","DOI":"10.1126\/science.1260419","article-title":"Proteomics. Tissue-based map of the human proteome","volume":"347","author":"Uhl\u00e9n","year":"2015","journal-title":"Science"},{"key":"2023020206245384900_btx207-B48","doi-asserted-by":"crossref","first-page":"953","DOI":"10.1038\/80726","article-title":"Creating a structural genomics consortium","volume":"7 Suppl","author":"Williamson","year":"2000","journal-title":"Nat. Struct. Biol"},{"key":"2023020206245384900_btx207-B49","doi-asserted-by":"crossref","first-page":"3369","DOI":"10.1093\/bioinformatics\/bti534","article-title":"RONN: the bio-basis function neural network technique applied to the detection of natively disordered regions in proteins","volume":"21","author":"Yang","year":"2005","journal-title":"Bioinformatics"},{"key":"2023020206245384900_btx207-B50","doi-asserted-by":"crossref","first-page":"57","DOI":"10.1186\/1471-2105-9-57","article-title":"Gene function prediction using labeled and unlabeled data","volume":"9","author":"Zhao","year":"2008","journal-title":"BMC Bioinformatics"},{"key":"2023020206245384900_btx207-B51","doi-asserted-by":"crossref","first-page":"1125","DOI":"10.1002\/prot.21870","article-title":"Protein classification with imbalanced data","volume":"70","author":"Zhao","year":"2008","journal-title":"Proteins"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/33\/16\/2487\/49040972\/bioinformatics_33_16_2487.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/33\/16\/2487\/49040972\/bioinformatics_33_16_2487.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,2]],"date-time":"2023-02-02T06:26:54Z","timestamp":1675319214000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/33\/16\/2487\/3111846"}},"subtitle":[],"editor":[{"given":"Alfonso","family":"Valencia","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2017,4,7]]},"references-count":52,"journal-issue":{"issue":"16","published-print":{"date-parts":[[2017,8,15]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btx207","relation":{},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"value":"1367-4803","type":"print"},{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2017,8,15]]},"published":{"date-parts":[[2017,4,7]]}}}