{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,8]],"date-time":"2026-02-08T08:12:22Z","timestamp":1770538342845,"version":"3.49.0"},"reference-count":60,"publisher":"Oxford University Press (OUP)","issue":"8","license":[{"start":{"date-parts":[[2017,12,2]],"date-time":"2017-12-02T00:00:00Z","timestamp":1512172800000},"content-version":"vor","delay-in-days":1,"URL":"https:\/\/academic.oup.com\/journals\/pages\/about_us\/legal\/notices"}],"funder":[{"DOI":"10.13039\/100000002","name":"NIH","doi-asserted-by":"publisher","award":["R01 GM118709"],"award-info":[{"award-number":["R01 GM118709"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["ACI-1053575"],"award-info":[{"award-number":["ACI-1053575"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100009429","name":"NRSA","doi-asserted-by":"publisher","award":["F31GM116570"],"award-info":[{"award-number":["F31GM116570"]}],"id":[{"id":"10.13039\/100009429","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,4,15]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Motivation<\/jats:title>\n                  <jats:p>Multiple sequence alignments (MSAs) can provide essential input to many bioinformatics applications, including protein structure prediction and functional annotation. However, the optimal selection of sequences to obtain biologically informative MSAs for such purposes is poorly explored, and has traditionally been performed manually.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>We present Selection of Alignment by Maximal Mutual Information (SAMMI), an automated, sequence-based approach to objectively select an optimal MSA from a large set of alternatives sampled from a general sequence database search. The hypothesis of this approach is that the mutual information among MSA columns will be maximal for those MSAs that contain the most diverse set possible of the most structurally and functionally homogeneous protein sequences. SAMMI was tested to select MSAs for functional site residue prediction by analysis of conservation patterns on a set of 435 proteins obtained from protein\u2013ligand (peptides, nucleic acids and small substrates) and protein\u2013protein interaction databases.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>A freely accessible program, including source code, implementing SAMMI is available at https:\/\/github.com\/nelsongil92\/SAMMI.git.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Supplementary information<\/jats:title>\n                  <jats:p>Supplementary data are available at Bioinformatics online.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btx779","type":"journal-article","created":{"date-parts":[[2017,11,30]],"date-time":"2017-11-30T04:16:43Z","timestamp":1512015403000},"page":"1278-1286","source":"Crossref","is-referenced-by-count":10,"title":["Identifying functionally informative evolutionary sequence profiles"],"prefix":"10.1093","volume":"34","author":[{"given":"Nelson","family":"Gil","sequence":"first","affiliation":[{"name":"Department of Systems & Computational Biology, Albert Einstein College of Medicine, Bronx, NY, USA"}]},{"given":"Andras","family":"Fiser","sequence":"additional","affiliation":[{"name":"Department of Systems & Computational Biology, Albert Einstein College of Medicine, Bronx, NY, USA"}]}],"member":"286","published-online":{"date-parts":[[2017,12,1]]},"reference":[{"key":"2023012713005335400_btx779-B1","doi-asserted-by":"crossref","first-page":"416","DOI":"10.1016\/j.jmb.2008.12.045","article-title":"Domain-based and family-specific sequence identity thresholds increase the levels of reliable protein function transfer","volume":"387","author":"Addou","year":"2009","journal-title":"J. Mol. Biol"},{"key":"2023012713005335400_btx779-B2","doi-asserted-by":"crossref","first-page":"484","DOI":"10.1186\/1471-2105-7-484","article-title":"A statistical score for assessing the quality of multiple sequence alignments","volume":"7","author":"Ahola","year":"2006","journal-title":"BMC Bioinformatics"},{"key":"2023012713005335400_btx779-B3","doi-asserted-by":"crossref","first-page":"693","DOI":"10.1016\/0022-2836(87)90352-4","article-title":"Correlation of co-ordinated amino acid substitutions with function in viruses related to tobacco mosaic virus","volume":"193","author":"Altschuh","year":"1987","journal-title":"J. Mol. Biol"},{"key":"2023012713005335400_btx779-B4","doi-asserted-by":"crossref","first-page":"3389","DOI":"10.1093\/nar\/25.17.3389","article-title":"Gapped BLAST and PSI-BLAST: a new generation of protein database search programs","volume":"25","author":"Altschul","year":"1997","journal-title":"Nucleic Acids Res"},{"key":"2023012713005335400_btx779-B5","doi-asserted-by":"crossref","first-page":"e35","DOI":"10.1093\/bioinformatics\/btl218","article-title":"The iRMSD: a local measure of sequence alignment accuracy using structural information","volume":"22","author":"Armougom","year":"2006","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B6","doi-asserted-by":"crossref","first-page":"164","DOI":"10.1093\/oxfordjournals.molbev.a026229","article-title":"Correlations among amino acid sites in bHLH protein domains: an information theoretic analysis","volume":"17","author":"Atchley","year":"2000","journal-title":"Mol. Biol. Evol"},{"key":"2023012713005335400_btx779-B7","doi-asserted-by":"crossref","first-page":"749","DOI":"10.1111\/j.1432-1033.1991.tb16076.x","article-title":"Amino acid sequence analysis of the annexin super-gene family of proteins","volume":"198","author":"Barton","year":"1991","journal-title":"Eur. J. Biochem"},{"key":"2023012713005335400_btx779-B8","doi-asserted-by":"crossref","first-page":"e1000633.","DOI":"10.1371\/journal.pcbi.1000633","article-title":"Disentangling direct from indirect co-evolution of residues in protein alignments","volume":"6","author":"Burger","year":"2010","journal-title":"PLoS Comput. Biol"},{"key":"2023012713005335400_btx779-B9","doi-asserted-by":"crossref","first-page":"1125","DOI":"10.1093\/bioinformatics\/btp135","article-title":"Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information","volume":"25","author":"Buslje","year":"2009","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B10","doi-asserted-by":"crossref","first-page":"1875","DOI":"10.1093\/bioinformatics\/btm270","article-title":"Predicting functionally important residues from sequence conservation","volume":"23","author":"Capra","year":"2007","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B11","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1038\/nsb0295-171","article-title":"A method to predict functional residues in proteins","volume":"2","author":"Casari","year":"1995","journal-title":"Nat. Struct. Biol"},{"key":"2023012713005335400_btx779-B12","doi-asserted-by":"crossref","first-page":"1625","DOI":"10.1093\/molbev\/msu117","article-title":"TCS: a new multiple sequence alignment reliability measure to estimate alignment accuracy and improve phylogenetic tree reconstruction","volume":"31","author":"Chang","year":"2014","journal-title":"Mol. Biol. Evol"},{"key":"2023012713005335400_btx779-B13","doi-asserted-by":"crossref","DOI":"10.1093\/bib\/bbv099","article-title":"Multiple sequence alignment modeling: methods and applications","volume":"17","author":"Chatzou","year":"2016","journal-title":"Brief Bioinform"},{"key":"2023012713005335400_btx779-B14","volume-title":"Elements of Information Theory","author":"Cover","year":"2006"},{"key":"2023012713005335400_btx779-B15","author":"de Oliveira","year":"2016"},{"key":"2023012713005335400_btx779-B16","doi-asserted-by":"crossref","first-page":"e37645","DOI":"10.1371\/journal.pone.0037645","article-title":"Protein sequence alignment analysis by local covariation: coevolution statistics detect benchmark alignment errors","volume":"7","author":"Dickson","year":"2012","journal-title":"PLoS One"},{"key":"2023012713005335400_btx779-B17","doi-asserted-by":"crossref","first-page":"e11082","DOI":"10.1371\/journal.pone.0011082","article-title":"Identifying and seeing beyond multiple sequence alignment errors using intra-molecular protein covariation","volume":"5","author":"Dickson","year":"2010","journal-title":"PLoS One"},{"key":"2023012713005335400_btx779-B18","doi-asserted-by":"crossref","first-page":"334","DOI":"10.1186\/1471-2105-13-334","article-title":"Protein interface classification by evolutionary analysis","volume":"13","author":"Duarte","year":"2012","journal-title":"BMC Bioinformatics"},{"key":"2023012713005335400_btx779-B19","doi-asserted-by":"crossref","first-page":"333","DOI":"10.1093\/bioinformatics\/btm604","article-title":"Mutual information without the influence of phylogeny or entropy dramatically improves residue contact prediction","volume":"24","author":"Dunn","year":"2008","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B20","doi-asserted-by":"crossref","first-page":"325","DOI":"10.1016\/0092-8674(94)90301-8","article-title":"The crystal structure of human hypoxanthine-guanine phosphoribosyltransferase with bound GMP","volume":"78","author":"Eads","year":"1994","journal-title":"Cell"},{"key":"2023012713005335400_btx779-B21","doi-asserted-by":"crossref","first-page":"341","DOI":"10.1016\/j.jcp.2014.07.024","article-title":"Fast pseudolikelihood maximization for direct-coupling analysis of protein structure from many homologous amino-acid sequences","volume":"276","author":"Ekeberg","year":"2014","journal-title":"J. Comput. Phys"},{"key":"2023012713005335400_btx779-B22","doi-asserted-by":"crossref","first-page":"330","DOI":"10.1002\/prot.10043","article-title":"A study on protein sequence alignment quality","volume":"46","author":"Elofsson","year":"2002","journal-title":"Proteins"},{"key":"2023012713005335400_btx779-B23","doi-asserted-by":"crossref","first-page":"63.","DOI":"10.1186\/1471-2105-14-63","article-title":"Protein structure based prediction of catalytic residues","volume":"14","author":"Fajardo","year":"2013","journal-title":"BMC Bioinformatics"},{"key":"2023012713005335400_btx779-B24","doi-asserted-by":"crossref","first-page":"e1003847","DOI":"10.1371\/journal.pcbi.1003847","article-title":"Improving contact prediction along three dimensions","volume":"10","author":"Feinauer","year":"2014","journal-title":"PLoS Comput. Biol"},{"key":"2023012713005335400_btx779-B25","doi-asserted-by":"crossref","first-page":"309","DOI":"10.1002\/prot.340180402","article-title":"Correlated mutations and residue contacts in proteins","volume":"18","author":"Gobel","year":"1994","journal-title":"Proteins"},{"key":"2023012713005335400_btx779-B26","doi-asserted-by":"crossref","first-page":"774","DOI":"10.1016\/j.cell.2009.07.038","article-title":"Protein sectors: evolutionary units of three-dimensional structure","volume":"138","author":"Halabi","year":"2009","journal-title":"Cell"},{"key":"2023012713005335400_btx779-B27","doi-asserted-by":"crossref","first-page":"10915","DOI":"10.1073\/pnas.89.22.10915","article-title":"Amino acid substitution matrices from protein blocks","volume":"89","author":"Henikoff","year":"1992","journal-title":"Proc. Natl. Acad. Sci. USA"},{"key":"2023012713005335400_btx779-B28","article-title":"Sequence co-evolution gives 3D contacts and structures of protein complexes","volume":"3","author":"Hopf","year":"2014","journal-title":"eLife 2014"},{"key":"2023012713005335400_btx779-B29","author":"Hubbard","year":"1993"},{"key":"2023012713005335400_btx779-B30","doi-asserted-by":"crossref","first-page":"184","DOI":"10.1093\/bioinformatics\/btr638","article-title":"PSICOV: precise structural contact prediction using sparse inverse covariance estimation on large multiple sequence alignments","volume":"28","author":"Jones","year":"2012","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B31","doi-asserted-by":"crossref","first-page":"999","DOI":"10.1093\/bioinformatics\/btu791","article-title":"MetaPSICOV: combining coevolution methods for accurate prediction of contacts and long range hydrogen bonding in proteins","volume":"31","author":"Jones","year":"2015","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B32","doi-asserted-by":"crossref","first-page":"15674","DOI":"10.1073\/pnas.1314045110","article-title":"Assessing the utility of coevolution-based residue-residue contact predictions in a sequence- and structure-rich era","volume":"110","author":"Kamisetty","year":"2013","journal-title":"Proc. Natl. Acad. Sci. USA"},{"key":"2023012713005335400_btx779-B33","doi-asserted-by":"crossref","first-page":"7176","DOI":"10.1073\/pnas.90.15.7176","article-title":"Covariation of mutations in the V3 loop of human immunodeficiency virus type 1 envelope protein: an information theoretic analysis","volume":"90","author":"Korber","year":"1993","journal-title":"Proc. Natl. Acad. Sci. USA"},{"key":"2023012713005335400_btx779-B34","doi-asserted-by":"crossref","first-page":"774","DOI":"10.1016\/j.jmb.2007.05.022","article-title":"Inference of macromolecular assemblies from crystalline state","volume":"372","author":"Krissinel","year":"2007","journal-title":"J. Mol. Biol"},{"key":"2023012713005335400_btx779-B35","doi-asserted-by":"crossref","first-page":"34.","DOI":"10.1186\/s12861-015-0085-6","article-title":"Conservation analysis of sequences flanking the testis-determining gene Sry in 17 mammalian species","volume":"15","author":"Larney","year":"2015","journal-title":"BMC Dev. Biol"},{"key":"2023012713005335400_btx779-B36","doi-asserted-by":"crossref","first-page":"7120","DOI":"10.1093\/nar\/gki1020","article-title":"Automatic assessment of alignment quality","volume":"33","author":"Lassmann","year":"2005","journal-title":"Nucleic Acids Res"},{"key":"2023012713005335400_btx779-B37","doi-asserted-by":"crossref","first-page":"21.","DOI":"10.1016\/S0959-440X(02)00284-1","article-title":"Evolutionary predictions of binding surfaces and interactions","volume":"12","author":"Lichtarge","year":"2002","journal-title":"Curr. Opin. Struct. Biol"},{"key":"2023012713005335400_btx779-B38","doi-asserted-by":"crossref","first-page":"e27872","DOI":"10.1371\/journal.pone.0027872","article-title":"Improving the alignment quality of consistency based aligners with an evaluation function using synonymous protein words","volume":"6","author":"Lin","year":"2011","journal-title":"PLoS One"},{"key":"2023012713005335400_btx779-B39","doi-asserted-by":"crossref","first-page":"295","DOI":"10.1126\/science.286.5438.295","article-title":"Evolutionarily conserved pathways of energetic connectivity in protein families","volume":"286","author":"Lockless","year":"1999","journal-title":"Science"},{"key":"2023012713005335400_btx779-B40","doi-asserted-by":"crossref","first-page":"2592","DOI":"10.1093\/bioinformatics\/btu352","article-title":"SSpro\/ACCpro 5: almost perfect prediction of protein secondary structure and relative solvent accessibility using profiles, machine learning and structural similarity","volume":"30","author":"Magnan","year":"2014","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B41","doi-asserted-by":"crossref","first-page":"e28766","DOI":"10.1371\/journal.pone.0028766","article-title":"Protein 3D structure computed from evolutionary sequence variation","volume":"6","author":"Marks","year":"2011","journal-title":"PLoS One"},{"key":"2023012713005335400_btx779-B42","doi-asserted-by":"crossref","first-page":"E1293","DOI":"10.1073\/pnas.1111471108","article-title":"Direct-coupling analysis of residue coevolution captures native contacts across many protein families","volume":"108","author":"Morcos","year":"2011","journal-title":"Proc. Natl. Acad. Sci. USA"},{"key":"2023012713005335400_btx779-B43","doi-asserted-by":"crossref","first-page":"425","DOI":"10.1093\/bioinformatics\/btf882","article-title":"AltAVisT: comparing alternative multiple sequence alignments","volume":"19","author":"Morgenstern","year":"2003","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B44","doi-asserted-by":"crossref","first-page":"4364","DOI":"10.1093\/nar\/gkl514","article-title":"MUMMALS: multiple sequence alignment improved by using hidden Markov models with local structural information","volume":"34","author":"Pei","year":"2006","journal-title":"Nucleic Acids Res"},{"key":"2023012713005335400_btx779-B45","doi-asserted-by":"crossref","first-page":"498","DOI":"10.1093\/bioinformatics\/btm637","article-title":"Predicting disulfide bond connectivity in proteins by correlated mutations analysis","volume":"24","author":"Rubinstein","year":"2008","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B46","doi-asserted-by":"crossref","first-page":"766","DOI":"10.1016\/j.str.2013.02.022","article-title":"Functional classification of immune regulatory proteins","volume":"21","author":"Rubinstein","year":"2013","journal-title":"Structure"},{"key":"2023012713005335400_btx779-B47","doi-asserted-by":"crossref","first-page":"3128","DOI":"10.1093\/bioinformatics\/btu500","article-title":"CCMpred\u2013fast and precise prediction of protein residue-residue contacts from correlated mutations","volume":"30","author":"Seemayer","year":"2014","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B48","doi-asserted-by":"crossref","first-page":"1815","DOI":"10.1093\/bioinformatics\/btt259","article-title":"PconsC: combination of direct information methods and alignments improves contact prediction","volume":"29","author":"Skwark","year":"2013","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B49","doi-asserted-by":"crossref","first-page":"e1003889","DOI":"10.1371\/journal.pcbi.1003889","article-title":"Improved contact predictions using the recognition of protein like contact patterns","volume":"10","author":"Skwark","year":"2014","journal-title":"PLoS Comput. Biol"},{"key":"2023012713005335400_btx779-B50","doi-asserted-by":"crossref","first-page":"327","DOI":"10.1093\/bioinformatics\/15.4.327","article-title":"Automated analysis of interatomic contacts in proteins","volume":"15","author":"Sobolev","year":"1999","journal-title":"Bioinformatics"},{"key":"2023012713005335400_btx779-B51","doi-asserted-by":"crossref","first-page":"902","DOI":"10.1006\/jmbi.1997.1008","article-title":"Multiple sequence threading: an analysis of alignment quality and stability","volume":"269","author":"Taylor","year":"1997","journal-title":"J. Mol. Biol"},{"key":"2023012713005335400_btx779-B52","doi-asserted-by":"crossref","first-page":"127","DOI":"10.1002\/prot.20527","article-title":"BAliBASE 3.0: latest developments of the multiple sequence alignment benchmark","volume":"61","author":"Thompson","year":"2005","journal-title":"Proteins"},{"key":"2023012713005335400_btx779-B53","doi-asserted-by":"crossref","first-page":"2682","DOI":"10.1093\/nar\/27.13.2682","article-title":"A comprehensive comparison of multiple sequence alignment programs","volume":"27","author":"Thompson","year":"1999","journal-title":"Nucleic Acids Res"},{"key":"2023012713005335400_btx779-B54","doi-asserted-by":"crossref","first-page":"376","DOI":"10.1093\/bib\/bbt068","article-title":"Information theory applications for biological sequence analysis","volume":"15","author":"Vinga","year":"2014","journal-title":"Brief Bioinform"},{"key":"2023012713005335400_btx779-B55","doi-asserted-by":"crossref","first-page":"3031","DOI":"10.1016\/j.jmb.2015.07.016","article-title":"Updates to the integrated protein\u2013protein interaction benchmarks: docking benchmark version 5 and affinity benchmark version 2","volume":"427","author":"Vreven","year":"2015","journal-title":"J. Mol. Biol"},{"key":"2023012713005335400_btx779-B56","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1147\/rd.41.0066","article-title":"Information theoretical analysis of multivariate correlation","volume":"4","author":"Watanabe","year":"1960","journal-title":"IBM J. Res. Dev"},{"key":"2023012713005335400_btx779-B57","volume-title":"Data Mining: Practical Machine Learning Tools and Techniques","author":"Witten","year":"2011"},{"key":"2023012713005335400_btx779-B58","doi-asserted-by":"crossref","first-page":"D1096","DOI":"10.1093\/nar\/gks966","article-title":"BioLiP: a semi-manually curated database for biologically relevant ligand-protein interactions","volume":"41","author":"Yang","year":"2013","journal-title":"Nucleic Acids Res."},{"key":"2023012713005335400_btx779-B59","doi-asserted-by":"crossref","first-page":"945","DOI":"10.1016\/j.jmb.2013.11.009","article-title":"Functional clustering of immunoglobulin superfamily proteins with protein\u2013protein interaction information calibrated hidden Markov model sequence profiles","volume":"426","author":"Yap","year":"2014","journal-title":"J. Mol. Biol"},{"key":"2023012713005335400_btx779-B60","doi-asserted-by":"crossref","first-page":"5130","DOI":"10.1073\/pnas.1522586113","article-title":"Part mutual information for quantifying direct associations in networks","volume":"113","author":"Zhao","year":"2016","journal-title":"Proc. Natl. Acad. Sci. USA"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/34\/8\/1278\/48915116\/bioinformatics_34_8_1278.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/34\/8\/1278\/48915116\/bioinformatics_34_8_1278.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T13:51:02Z","timestamp":1674827462000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/34\/8\/1278\/4683462"}},"subtitle":[],"editor":[{"given":"John","family":"Hancock","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2017,12,1]]},"references-count":60,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2018,4,15]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btx779","relation":{},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"value":"1367-4803","type":"print"},{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2018,4,15]]},"published":{"date-parts":[[2017,12,1]]}}}