{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,29]],"date-time":"2026-03-29T12:09:30Z","timestamp":1774786170911,"version":"3.50.1"},"reference-count":15,"publisher":"Oxford University Press (OUP)","issue":"18","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2005,9,15]]},"abstract":"<jats:title>Abstract<\/jats:title><jats:p>Summary: The CluSTr database employs a fully automatic single-linkage hierarchical clustering method based on a similarity matrix. In order to compute the matrix, first all-against-all pair-wise comparisons between protein sequences are computed using the Smith\u2013Waterman algorithm. The statistical significance of the similarity scores is then assessed using a Monte Carlo analysis, yielding Z-values, which are used to populate the matrix. This paper describes automated annotation experiments that quantify the predictive power and hence the biological relevance of the CluSTr data. The experiments utilized the UniProt data-mining framework to derive annotation predictions using combinations of InterPro and CluSTr. We show that this combination of data sources greatly increases the precision of predictions made by the data-mining framework, compared with the use of InterPro data alone. We conclude that the CluSTr approach to clustering proteins makes a valuable contribution to traditional protein classifications.<\/jats:p><jats:p>Availability: \u00a0http:\/\/www.ebi.ac.uk\/clustr\/<\/jats:p><jats:p>Contact: \u00a0rolf.apweiler@ebi.ac.uk<\/jats:p>","DOI":"10.1093\/bioinformatics\/bti542","type":"journal-article","created":{"date-parts":[[2005,6,17]],"date-time":"2005-06-17T00:36:34Z","timestamp":1118968594000},"page":"3604-3609","source":"Crossref","is-referenced-by-count":35,"title":["The predictive power of the CluSTr database"],"prefix":"10.1093","volume":"21","author":[{"given":"Robert","family":"Petryszak","sequence":"first","affiliation":[{"name":"EMBL Outstation Hinxton, The European Bioinformatics Institute (EBI) Wellcome Trust Genome Campus, Hinxton, Cambridgeshire CB10 1SD, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ernst","family":"Kretschmann","sequence":"additional","affiliation":[{"name":"EMBL Outstation Hinxton, The European Bioinformatics Institute (EBI) Wellcome Trust Genome Campus, Hinxton, Cambridgeshire CB10 1SD, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daniela","family":"Wieser","sequence":"additional","affiliation":[{"name":"EMBL Outstation Hinxton, The European Bioinformatics Institute (EBI) Wellcome Trust Genome Campus, Hinxton, Cambridgeshire CB10 1SD, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rolf","family":"Apweiler","sequence":"additional","affiliation":[{"name":"EMBL Outstation Hinxton, The European Bioinformatics Institute (EBI) Wellcome Trust Genome Campus, Hinxton, Cambridgeshire CB10 1SD, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"286","published-online":{"date-parts":[[2005,6,16]]},"reference":[{"key":"2023060912072844400_B1","doi-asserted-by":"crossref","unstructured":"Apweiler, R., et al. 2004UniProt: the Universal Protein knowledgebase. Nucleic Acids Res.32D115\u2013D119","DOI":"10.1093\/nar\/gkh131"},{"key":"2023060912072844400_B2","doi-asserted-by":"crossref","unstructured":"Bastien, O., et al. 2004Fundamentals of massive automatic pairwise alignments of protein sequences: theoretical significance of Z-value statistics. Bioinformatics20534\u2013537","DOI":"10.1093\/bioinformatics\/bth024"},{"key":"2023060912072844400_B3","unstructured":"Comet, J.P., et al. 1999Significance of Z-value statistics of Smith\u2013Waterman scores for protein alignments. Comput Chem.23317\u2013331"},{"key":"2023060912072844400_B4","doi-asserted-by":"crossref","unstructured":"Enright, A.J., et al. 2002An efficient algorithm for large-scale detection of protein families. Nucleic Acids Res.301575\u20131584","DOI":"10.1093\/nar\/30.7.1575"},{"key":"2023060912072844400_B5","doi-asserted-by":"crossref","unstructured":"Hermjakob, H., et al. 1999Swissknife\u2014\u2018lazy parsing\u2019 of Swiss-Prot entries. Bioinfomatics15771\u2013772","DOI":"10.1093\/bioinformatics\/15.9.771"},{"key":"2023060912072844400_B6","unstructured":"Kersey, P.J., et al. 2004The International Protein Index: An integrated database for proteomics experiments. Proteomics41985\u20131988"},{"key":"2023060912072844400_B7","doi-asserted-by":"crossref","unstructured":"Kretschmann, E., et al. 2001Automatic rule generation for protein annotation with the C4.5 data mining algorithm applied on SWISS-PROT. Bioinformatics17920\u2013926","DOI":"10.1093\/bioinformatics\/17.10.920"},{"key":"2023060912072844400_B8","unstructured":"Kretschmann, E., Rakow, A., Hackmann, A., Apweiler, R., et al. 2004The Aristotle semantic network technology. Proceedings of the Eighth World Multiconference on Systemics, Cybernetics and Informatics (SCI 2004)1365\u201370"},{"key":"2023060912072844400_B9","doi-asserted-by":"crossref","unstructured":"Kriventseva, E.V., et al. 2001CluSTr: a database of clusters of SWISS-PROT+TrEMBL proteins. Nucleic Acids Res.293\u20136","DOI":"10.1093\/nar\/29.1.33"},{"key":"2023060912072844400_B10","doi-asserted-by":"crossref","unstructured":"Mulder, N.J., et al. 2003The InterPro Database, 2003 brings increased coverage and new features. Nucleic Acids Res.31315\u2013318","DOI":"10.1093\/nar\/gkg046"},{"key":"2023060912072844400_B11","doi-asserted-by":"crossref","unstructured":"Tatusov, R.L., et al. 2003The COG database: an updated version includes eukaryotes. BMC Bioinformatics441","DOI":"10.1186\/1471-2105-4-41"},{"key":"2023060912072844400_B12","doi-asserted-by":"crossref","unstructured":"Wieser, D., et al. 2004Filtering erroneous protein annotation. Bioinformatics20i342\u2013i347","DOI":"10.1093\/bioinformatics\/bth938"},{"key":"2023060912072844400_B13","unstructured":"Wu, C.H., et al. 2003The Protein Information Resource. Nucleic Acids Res.31345\u2013347"},{"key":"2023060912072844400_B14","doi-asserted-by":"crossref","unstructured":"Yona, G., et al. 2000ProtoMap: automatic classification of protein sequences and hierarchy of protein families. Nucleic Acids Res.2849\u201355","DOI":"10.1093\/nar\/28.1.49"},{"key":"2023060912072844400_B15","doi-asserted-by":"crossref","unstructured":"Zdobnov, E.M. and Apweiler, R. 2001InterProScan\u2013an integration platform for the signature-recognition methods in InterPro. Bioinformatics17847\u2013848","DOI":"10.1093\/bioinformatics\/17.9.847"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/21\/18\/3604\/50554973\/bioinformatics_21_18_3604.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/21\/18\/3604\/50554973\/bioinformatics_21_18_3604.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T19:22:56Z","timestamp":1735759376000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/21\/18\/3604\/202120"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2005,6,16]]},"references-count":15,"journal-issue":{"issue":"18","published-print":{"date-parts":[[2005,9,15]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/bti542","relation":{},"ISSN":["1367-4811","1367-4803"],"issn-type":[{"value":"1367-4811","type":"electronic"},{"value":"1367-4803","type":"print"}],"subject":[],"published-other":{"date-parts":[[2005,9]]},"published":{"date-parts":[[2005,6,16]]}}}