{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T10:52:55Z","timestamp":1740135175922,"version":"3.37.3"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"1","funder":[{"name":"National Science Foundation","award":["1262664"],"award-info":[{"award-number":["1262664"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["BMC Bioinformatics"],"published-print":{"date-parts":[[2018,12]]},"DOI":"10.1186\/s12859-018-2080-y","type":"journal-article","created":{"date-parts":[[2018,3,5]],"date-time":"2018-03-05T11:40:25Z","timestamp":1520250025000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Alignment-free clustering of large data sets of unannotated protein conserved regions using minhashing"],"prefix":"10.1186","volume":"19","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1822-0928","authenticated-orcid":false,"given":"Armen","family":"Abnousi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shira L.","family":"Broschat","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ananth","family":"Kalyanaraman","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,3,5]]},"reference":[{"issue":"D1","key":"2080_CR1","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1093\/nar\/gkw1099","volume":"45","author":"U Consortium","year":"2017","unstructured":"Consortium U, et al. Uniprot: the universal protein knowledgebase. Nucleic acids research. 2017; 45(D1):158\u201369.","journal-title":"Nucleic acids research"},{"issue":"4","key":"2080_CR2","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1038\/scientificamerican1093-50","volume":"269","author":"RF Doolittle","year":"1993","unstructured":"Doolittle RF, Bork P. Evolutionarily mobile modules in proteins. Scientific American. 1993; 269(4):50\u20136.","journal-title":"Scientific American"},{"issue":"3","key":"2080_CR3","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1002\/(SICI)1097-0134(199707)28:3<405::AID-PROT10>3.0.CO;2-L","volume":"28","author":"EL Sonnhammer","year":"1997","unstructured":"Sonnhammer EL, Eddy SR, Durbin R, et al. Pfam: a comprehensive database of protein domain families based on seed alignments. Proteins-Structure Function and Genetics. 1997; 28(3):405\u201320.","journal-title":"Proteins-Structure Function and Genetics"},{"issue":"8","key":"2080_CR4","doi-asserted-by":"publisher","first-page":"0161338","DOI":"10.1371\/journal.pone.0161338","volume":"11","author":"A Abnousi","year":"2016","unstructured":"Abnousi A, Broschat SL, Kalyanaraman A. A fast alignment-free approach for de novo detection of protein conserved regions. PloS ONE. 2016; 11(8):0161338.","journal-title":"PloS ONE"},{"key":"2080_CR5","volume-title":"Compression and Complexity of Sequences 1997. Proceedings","author":"AZ Broder","year":"1997","unstructured":"Broder AZ. On the resemblance and containment of documents. In: Compression and Complexity of Sequences 1997. Proceedings. USA: IEEE: 1997. p. 21\u201329."},{"key":"2080_CR6","volume-title":"Proceedings of the Thirtieth Annual ACM Symposium on Theory of Computing","author":"P Indyk","year":"1998","unstructured":"Indyk P, Motwani R. Approximate nearest neighbors: towards removing the curse of dimensionality. In: Proceedings of the Thirtieth Annual ACM Symposium on Theory of Computing. USA: ACM: 1998. p. 604\u201313."},{"key":"2080_CR7","volume-title":"VLDB, vol. 99.","author":"A Gionis","year":"1999","unstructured":"Gionis A, Indyk P, Motwani R, et al. Similarity search in high dimensions via hashing. In: VLDB, vol. 99.USA: VLDB endowment: 1999. p. 518\u201329."},{"key":"2080_CR8","doi-asserted-by":"crossref","unstructured":"Dean J, Ghemawat S. MapReduce: simplified data processing on large clusters. ACM. 2008. http:\/\/mapreduce.sandia.gov\/index.html .","DOI":"10.1145\/1327452.1327492"},{"issue":"3","key":"2080_CR9","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1016\/0022-2836(70)90057-4","volume":"48","author":"SB Needleman","year":"1970","unstructured":"Needleman SB, Wunsch CD. A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol. 1970; 48(3):443\u201353.","journal-title":"J Mol Biol"},{"issue":"1","key":"2080_CR10","doi-asserted-by":"publisher","first-page":"195","DOI":"10.1016\/0022-2836(81)90087-5","volume":"147","author":"TF Smith","year":"1981","unstructured":"Smith TF, Waterman MS. Identification of common molecular subsequences. J Mol Biol. 1981; 147(1):195\u20137.","journal-title":"J Mol Biol"},{"issue":"3","key":"2080_CR11","doi-asserted-by":"publisher","first-page":"403","DOI":"10.1016\/S0022-2836(05)80360-2","volume":"215","author":"SF Altschul","year":"1990","unstructured":"Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ. Basic local alignment search tool. J Mol Biol. 1990; 215(3):403\u201310.","journal-title":"J Mol Biol"},{"issue":"17","key":"2080_CR12","doi-asserted-by":"publisher","first-page":"3389","DOI":"10.1093\/nar\/25.17.3389","volume":"25","author":"SF Altschul","year":"1997","unstructured":"Altschul SF, Madden TL, Sch\u00e4ffer AA, Zhang J, Zhang Z, Miller W, Lipman DJ. Gapped blast and psi-blast: a new generation of protein database search programs. Nucleic acids Res. 1997; 25(17):3389\u2013402.","journal-title":"Nucleic acids Res"},{"issue":"10","key":"2080_CR13","doi-asserted-by":"publisher","first-page":"1923","DOI":"10.1109\/TPDS.2012.19","volume":"23","author":"C Wu","year":"2012","unstructured":"Wu C, Kalyanaraman A, Cannon WR. pgraph: Efficient parallel construction of large-scale protein sequence homology graphs. IEEE Trans Parallel Distrib Syst. 2012; 23(10):1923\u201333.","journal-title":"IEEE Trans Parallel Distrib Syst"},{"issue":"12","key":"2080_CR14","doi-asserted-by":"publisher","first-page":"1615","DOI":"10.1089\/cmb.2009.0198","volume":"16","author":"G Reinert","year":"2009","unstructured":"Reinert G, Chew D, Sun F, Waterman MS. Alignment-free sequence comparison (i): statistics and power. J Comput Biol. 2009; 16(12):1615\u201334.","journal-title":"J Comput Biol"},{"key":"2080_CR15","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1007\/978-1-4020-2834-2_15","volume":"8","author":"O Sasson","year":"2004","unstructured":"Sasson O, Linial M. Protein clustering and classification. New Avenues Bioinforma. 2004; 8:203.","journal-title":"New Avenues Bioinforma"},{"issue":"D1","key":"2080_CR16","doi-asserted-by":"publisher","first-page":"279","DOI":"10.1093\/nar\/gkv1344","volume":"44","author":"RD Finn","year":"2016","unstructured":"Finn RD, Coggill P, Eberhardt RY, Eddy SR, Mistry J, Mitchell AL, Potter SC, Punta M, Qureshi M, Sangrador-Vegas A, et al. The pfam protein families database: towards a more sustainable future. Nucleic Acids Res. 2016; 44(D1):279\u201385.","journal-title":"Nucleic Acids Res"},{"issue":"11","key":"2080_CR17","doi-asserted-by":"publisher","first-page":"5857","DOI":"10.1073\/pnas.95.11.5857","volume":"95","author":"J Schultz","year":"1998","unstructured":"Schultz J, Milpetz F, Bork P, Ponting CP. Smart, a simple modular architecture research tool: identification of signaling domains. Proc Natl Acad Sci. 1998; 95(11):5857\u201364.","journal-title":"Proc Natl Acad Sci"},{"issue":"D1","key":"2080_CR18","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1093\/nar\/gku949","volume":"43","author":"I Letunic","year":"2014","unstructured":"Letunic I, Doerks T, Bork P. Smart: recent updates, new developments and status in 2015. Nucleic Acids Res. 2014; 43(D1):257\u201360.","journal-title":"Nucleic Acids Res"},{"issue":"3","key":"2080_CR19","doi-asserted-by":"publisher","first-page":"265","DOI":"10.1093\/bib\/3.3.265","volume":"3","author":"CJ Sigrist","year":"2002","unstructured":"Sigrist CJ, Cerutti L, Hulo N, Gattiker A, Falquet L, Pagni M, Bairoch A, Bucher P. Prosite: a documented database using patterns and profiles as motif descriptors. Brief Bioinform. 2002; 3(3):265\u201374.","journal-title":"Brief Bioinform"},{"issue":"17","key":"2080_CR20","first-page":"3590","volume":"22","author":"T Attwood","year":"1994","unstructured":"Attwood T, Beck M, Bleasby A, Parry-Smith D. Prints\u2013a database of protein motif fingerprints. Nucleic Acids Res. 1994; 22(17):3590.","journal-title":"Nucleic Acids Res"},{"issue":"1","key":"2080_CR21","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1093\/nar\/29.1.41","volume":"29","author":"DH Haft","year":"2001","unstructured":"Haft DH, Loftus BJ, Richardson DL, Yang F, Eisen JA, Paulsen IT, White O. Tigrfams: a protein family resource for the functional identification of proteins. Nucleic Acids Res. 2001; 29(1):41\u20133.","journal-title":"Nucleic Acids Res"},{"issue":"1","key":"2080_CR22","doi-asserted-by":"publisher","first-page":"277","DOI":"10.1186\/1471-2105-7-277","volume":"7","author":"E Portugaly","year":"2006","unstructured":"Portugaly E, Harel A, Linial N, Linial M. Everest: automatic identification and classification of protein domains in all protein sequences. BMC Bioinformatics. 2006; 7(1):277.","journal-title":"BMC Bioinformatics"},{"issue":"3","key":"2080_CR23","doi-asserted-by":"publisher","first-page":"749","DOI":"10.1016\/S0022-2836(03)00269-9","volume":"328","author":"A Heger","year":"2003","unstructured":"Heger A, Holm L. Exhaustive enumeration of protein domain families. J Mol Biol. 2003; 328(3):749\u201367.","journal-title":"J Mol Biol"},{"issue":"2","key":"2080_CR24","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1093\/bioinformatics\/14.2.164","volume":"14","author":"J Gracy","year":"1998","unstructured":"Gracy J, Argos P. Automated protein sequence database classification. i. integration of compositional similarity search, local similarity search, and multiple sequence alignment. Bioinformatics (Oxford, England). 1998; 14(2):164\u201373.","journal-title":"Bioinformatics (Oxford, England)"},{"key":"2080_CR25","volume-title":"Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing","author":"C Wu","year":"2008","unstructured":"Wu C, Kalyanaraman A. An efficient parallel approach for identifying protein families in large-scale metagenomic data sets. In: Proceedings of the 2008 ACM\/IEEE Conference on Supercomputing. USA: IEEE Press: 2008. p. 35."},{"issue":"2","key":"2080_CR26","doi-asserted-by":"publisher","first-page":"174","DOI":"10.1093\/bioinformatics\/14.2.174","volume":"14","author":"J Gracy","year":"1998","unstructured":"Gracy J, Argos P. Automated protein sequence database classification. ii. delineation of domain boundaries from sequence similarities. Bioinformatics (Oxford, England). 1998; 14(2):174\u201387.","journal-title":"Bioinformatics (Oxford, England)"},{"key":"2080_CR27","volume-title":"Proceedings of the 31st International Conference on Very Large Data Bases","author":"D Gibson","year":"2005","unstructured":"Gibson D, Kumar R, Tomkins A. Discovering large dense subgraphs in massive graphs. In: Proceedings of the 31st International Conference on Very Large Data Bases. USA: VLDB Endowment: 2005. p. 721\u201332."},{"issue":"10","key":"2080_CR28","doi-asserted-by":"publisher","first-page":"10008","DOI":"10.1088\/1742-5468\/2008\/10\/P10008","volume":"2008","author":"VD Blondel","year":"2008","unstructured":"Blondel VD, Guillaume J-L, Lambiotte R, Lefebvre E. Fast unfolding of communities in large networks. J Stat Mech Theory Exp. 2008; 2008(10):10008.","journal-title":"J Stat Mech Theory Exp"},{"issue":"8-13","key":"2080_CR29","doi-asserted-by":"publisher","first-page":"1157","DOI":"10.1016\/S0169-7552(97)00031-7","volume":"29","author":"AZ Broder","year":"1997","unstructured":"Broder AZ, Glassman SC, Manasse MS, Zweig G. Syntactic clustering of the web. Comput Netw ISDN Syst. 1997; 29(8-13):1157\u201366.","journal-title":"Comput Netw ISDN Syst"},{"key":"2080_CR30","first-page":"327","volume":"60","author":"AZ Broder","year":"1998","unstructured":"Broder AZ, Charikar M, Frieze AM, Mitzenmacher M. Min-wise independent permutations. J Comput Syst Sci. 1998; 60:327\u201336.","journal-title":"J Comput Syst Sci"},{"key":"2080_CR31","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1016\/j.parco.2015.03.003","volume":"47","author":"H Lu","year":"2015","unstructured":"Lu H, Halappanavar M, Kalyanaraman A. Parallel heuristics for scalable community detection. Parallel Comput. 2015; 47:19\u201337.","journal-title":"Parallel Comput"},{"issue":"9","key":"2080_CR32","doi-asserted-by":"publisher","first-page":"610","DOI":"10.1016\/j.parco.2011.02.004","volume":"37","author":"SJ Plimpton","year":"2011","unstructured":"Plimpton SJ, Devine KD. Mapreduce in mpi for large-scale graph algorithms. Parallel Comput. 2011; 37(9):610\u201332.","journal-title":"Parallel Comput"},{"key":"2080_CR33","unstructured":"Lockwood S. Applications and Extensions of pClust to Big Microbial Proteomic Data.Washington State University; 2016."},{"key":"2080_CR34","doi-asserted-by":"publisher","first-page":"132","DOI":"10.1016\/j.jpdc.2014.08.009","volume":"79","author":"J Daily","year":"2015","unstructured":"Daily J, Kalyanaraman A, Krishnamoorthy S, Vishnu A. A work stealing based approach for enabling scalable optimal sequence homology detection. J Parallel Distrib Comput. 2015; 79:132\u201342.","journal-title":"J Parallel Distrib Comput"},{"key":"2080_CR35","volume-title":"Proceedings of the 22nd ACM International Conference on Conference on Information & Knowledge Management","author":"JJ Whang","year":"2013","unstructured":"Whang JJ, Gleich DF, Dhillon IS. Overlapping community detection using seed set expansion. In: Proceedings of the 22nd ACM International Conference on Conference on Information & Knowledge Management. USA: ACM: 2013. p. 2099\u2013108."},{"issue":"4","key":"2080_CR36","doi-asserted-by":"publisher","first-page":"2018","DOI":"10.1371\/journal.pone.0002018","volume":"3","author":"JJ Gillespie","year":"2008","unstructured":"Gillespie JJ, Williams K, Shukla M, Snyder EE, Nordberg EK, Ceraul SM, Dharmanolla C, Rainey D, Soneja J, Shallom JM, et al. Rickettsia phylogenomics: unwinding the intricacies of obligate intracellular life. PloS ONE. 2008; 3(4):2018.","journal-title":"PloS ONE"},{"issue":"19","key":"2080_CR37","doi-asserted-by":"publisher","first-page":"3811","DOI":"10.1093\/nar\/27.19.3811","volume":"27","author":"K Kossen","year":"1999","unstructured":"Kossen K, Uhlenbeck OC. Cloning and biochemical characterization of bacillus subtilis yxin, a dead protein specifically activated by 23s rrna: delineation of a novel sub-family of bacterial dead proteins. Nucleic Acids Res. 1999; 27(19):3811\u201320.","journal-title":"Nucleic Acids Res"},{"issue":"4","key":"2080_CR38","doi-asserted-by":"publisher","first-page":"625","DOI":"10.1016\/S0022-2836(02)01140-3","volume":"324","author":"K Kossen","year":"2002","unstructured":"Kossen K, Karginov FV, Uhlenbeck OC. The carboxy-terminal domain of the dexdh protein yxin is sufficient to confer specificity for 23s rrna. J Mol Biol. 2002; 324(4):625\u201336.","journal-title":"J Mol Biol"},{"issue":"23","key":"2080_CR39","doi-asserted-by":"publisher","first-page":"3150","DOI":"10.1093\/bioinformatics\/bts565","volume":"28","author":"L Fu","year":"2012","unstructured":"Fu L, Niu B, Zhu Z, Wu S, Li W. Cd-hit: accelerated for clustering the next-generation sequencing data. Bioinformatics. 2012; 28(23):3150\u20132.","journal-title":"Bioinformatics"},{"issue":"1","key":"2080_CR40","doi-asserted-by":"publisher","first-page":"481","DOI":"10.1186\/s12864-016-2744-9","volume":"17","author":"S Lockwood","year":"2016","unstructured":"Lockwood S, Brayton KA, Broschat SL. Comparative genomics reveals multiple pathways to mutualism for tick-borne pathogens. BMC Genomics. 2016; 17(1):481.","journal-title":"BMC Genomics"},{"key":"2080_CR41","doi-asserted-by":"crossref","unstructured":"Brandes U, Wagner D. Analysis and visualization of social networks. Graph Drawing Softw. 2004;321\u201340.","DOI":"10.1007\/978-3-642-18638-7_15"}],"container-title":["BMC Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1186\/s12859-018-2080-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,5,19]],"date-time":"2020-05-19T03:15:02Z","timestamp":1589858102000},"score":1,"resource":{"primary":{"URL":"https:\/\/bmcbioinformatics.biomedcentral.com\/articles\/10.1186\/s12859-018-2080-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,3,5]]},"references-count":41,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2018,12]]}},"alternative-id":["2080"],"URL":"https:\/\/doi.org\/10.1186\/s12859-018-2080-y","relation":{},"ISSN":["1471-2105"],"issn-type":[{"type":"electronic","value":"1471-2105"}],"subject":[],"published":{"date-parts":[[2018,3,5]]},"assertion":[{"value":"31 October 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 February 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 March 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Not applicable.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"The authors declare that they have no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Publisher\u2019s Note"}}],"article-number":"83"}}