{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T14:13:36Z","timestamp":1740147216834,"version":"3.37.3"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2019,5,31]],"date-time":"2019-05-31T00:00:00Z","timestamp":1559260800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2019,5,31]],"date-time":"2019-05-31T00:00:00Z","timestamp":1559260800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/501100001871","name":"Funda\u00e7\u00e3o para a Ci\u00eancia e a Tecnologia","doi-asserted-by":"publisher","award":["PD\/BD\/105729\/2014"],"award-info":[{"award-number":["PD\/BD\/105729\/2014"]}],"id":[{"id":"10.13039\/501100001871","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Adv Data Anal Classif"],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1007\/s11634-019-00362-x","type":"journal-article","created":{"date-parts":[[2019,5,31]],"date-time":"2019-05-31T18:04:43Z","timestamp":1559325883000},"page":"57-76","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Clustering genomic words in human DNA using peaks and trends of distributions"],"prefix":"10.1007","volume":"14","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4632-3561","authenticated-orcid":false,"given":"Ana Helena","family":"Tavares","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2093-3137","authenticated-orcid":false,"given":"Jakob","family":"Raymaekers","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3807-5353","authenticated-orcid":false,"given":"Peter J.","family":"Rousseeuw","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2593-8818","authenticated-orcid":false,"given":"Paula","family":"Brito","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1051-8084","authenticated-orcid":false,"given":"Vera","family":"Afreixo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,5,31]]},"reference":[{"key":"362_CR1","unstructured":"Abramowitz M, Stegun IA (1964) Handbook of mathematical functions: with formulas, graphs, and mathematical tables, vol\u00a055. Courier Corporation"},{"issue":"2","key":"362_CR2","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1093\/biostatistics\/kxu041","volume":"16","author":"V Afreixo","year":"2014","unstructured":"Afreixo V, Rodrigues JM, Bastos CA (2014) Analysis of single-strand exceptional word symmetry in the human genome: new measures. Biostatistics 16(2):209\u2013221","journal-title":"Biostatistics"},{"issue":"8","key":"362_CR3","doi-asserted-by":"crossref","first-page":"1923","DOI":"10.1101\/gr.869803","volume":"13","author":"VB Bajic","year":"2003","unstructured":"Bajic VB, Seah SH (2003) Dragon gene start finder: an advanced system for finding approximate locations of the start of gene transcriptional units. Genome Res 13(8):1923\u20131929","journal-title":"Genome Res"},{"key":"362_CR4","volume-title":"Runs and scans with applications","author":"N Balakrishnan","year":"2011","unstructured":"Balakrishnan N, Koutras MV (2011) Runs and scans with applications, vol 764. Wiley, New York"},{"issue":"4","key":"362_CR5","doi-asserted-by":"publisher","first-page":"1358","DOI":"10.1073\/pnas.89.4.1358","volume":"89","author":"C Burge","year":"1992","unstructured":"Burge C, Campbell AM, Karlin S (1992) Over-and under-representation of short oligonucleotides in DNA sequences. Proc Natl Acad Sci 89(4):1358\u20131362","journal-title":"Proc Natl Acad Sci"},{"issue":"1","key":"362_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1080\/03610927408827101","volume":"3","author":"T Cali\u0144ski","year":"1974","unstructured":"Cali\u0144ski T, Harabasz J (1974) A dendrite method for cluster analysis. Commun Stat Theory Methods 3(1):1\u201327","journal-title":"Commun Stat Theory Methods"},{"issue":"6822","key":"362_CR7","doi-asserted-by":"publisher","first-page":"860","DOI":"10.1038\/35057062","volume":"409","author":"IHGS Consortium","year":"2001","unstructured":"Consortium IHGS et al (2001) Initial sequencing and analysis of the human genome. Nature 409(6822):860","journal-title":"Nature"},{"issue":"2","key":"362_CR8","doi-asserted-by":"publisher","first-page":"553","DOI":"10.1214\/aos\/1031833664","volume":"25","author":"JA Cuesta-Albertos","year":"1997","unstructured":"Cuesta-Albertos JA, Gordaliza A, Matr\u00e1n C (1997) Trimmed k-means: an attempt to robustify quantizers. Ann Stat 25(2):553\u2013576","journal-title":"Ann Stat"},{"issue":"10","key":"362_CR9","doi-asserted-by":"publisher","first-page":"1010","DOI":"10.1101\/gad.2037511","volume":"25","author":"AM Deaton","year":"2011","unstructured":"Deaton AM, Bird A (2011) CpG islands and the regulation of transcription. Genes Dev 25(10):1010\u20131022","journal-title":"Genes Dev"},{"issue":"12","key":"362_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.18637\/jss.v047.i12","volume":"47","author":"H Fritz","year":"2012","unstructured":"Fritz H, Garcia-Escudero LA, Mayo-Iscar A (2012) tclust: an R package for a trimming approach to cluster analysis. J Stat Softw 47(12):1\u201326","journal-title":"J Stat Softw"},{"key":"362_CR11","unstructured":"Fu JC (1996) Distribution theory of runs and patterns associated with a sequence of multi-state trials. Stat Sin 957\u2013974"},{"key":"362_CR12","doi-asserted-by":"publisher","DOI":"10.1142\/4669","volume-title":"Distribution theory of runs and patterns and its applications: a finite Markov chain imbedding approach","author":"JC Fu","year":"2003","unstructured":"Fu JC, Lou WW (2003) Distribution theory of runs and patterns and its applications: a finite Markov chain imbedding approach. World Scientific, Singapore"},{"key":"362_CR13","doi-asserted-by":"publisher","first-page":"1324","DOI":"10.1214\/07-AOS515","volume":"36","author":"LA Garcia-Escudero","year":"2008","unstructured":"Garcia-Escudero LA, Gordaliza A, Matr\u00e1n C, Mayo-Iscar A (2008) A general trimming approach to robust cluster analysis. Ann Stat 36:1324\u20131345","journal-title":"Ann Stat"},{"issue":"2","key":"362_CR14","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1016\/0022-2836(87)90689-9","volume":"196","author":"M Gardiner-Garden","year":"1987","unstructured":"Gardiner-Garden M, Frommer M (1987) CpG islands in vertebrate genomes. J Mol Biol 196(2):261\u2013282","journal-title":"J Mol Biol"},{"issue":"4","key":"362_CR15","doi-asserted-by":"publisher","first-page":"703","DOI":"10.3233\/IDA-2012-0545","volume":"16","author":"L Guerra","year":"2012","unstructured":"Guerra L, Robles V, Bielza C, Larra\u00f1aga P (2012) A comparison of clustering quality indices using outliers and noise. Intell Data Anal 16(4):703\u2013715","journal-title":"Intell Data Anal"},{"issue":"6","key":"362_CR16","doi-asserted-by":"publisher","first-page":"1154","DOI":"10.1016\/j.jmva.2007.07.002","volume":"99","author":"C Hennig","year":"2008","unstructured":"Hennig C (2008) Dissolution point and isolation robustness: robustness criteria for general cluster analysis methods. J Multivar Anal 99(6):1154\u20131176","journal-title":"J Multivar Anal"},{"issue":"1","key":"362_CR17","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1007\/BF01908075","volume":"2","author":"L Hubert","year":"1985","unstructured":"Hubert L, Arabie P (1985) Comparing partitions. J Classif 2(1):193\u2013218","journal-title":"J Classif"},{"issue":"6","key":"362_CR18","doi-asserted-by":"publisher","first-page":"1072","DOI":"10.1037\/0033-2909.83.6.1072","volume":"83","author":"LJ Hubert","year":"1976","unstructured":"Hubert LJ, Levin JR (1976) A general statistical framework for assessing categorical clustering in free recall. Psychol Bull 83(6):1072","journal-title":"Psychol Bull"},{"issue":"4","key":"362_CR19","doi-asserted-by":"publisher","first-page":"247","DOI":"10.1093\/mutage\/gem009","volume":"22","author":"FV Jacinto","year":"2007","unstructured":"Jacinto FV, Esteller M (2007) Mutator pathways unleashed by epigenetic silencing in human cancer. Mutagenesis 22(4):247\u2013253","journal-title":"Mutagenesis"},{"key":"362_CR20","doi-asserted-by":"publisher","DOI":"10.1002\/9780470316801","volume-title":"Finding groups in data","author":"L Kaufman","year":"1990","unstructured":"Kaufman L, Rousseeuw PJ (1990) Finding groups in data. Wiley, New York"},{"issue":"3","key":"362_CR21","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1089\/cmb.1996.3.345","volume":"3","author":"MY Leung","year":"1996","unstructured":"Leung MY, Marsh GM, Speed TP (1996) Over-and underrepresentation of short DNA words in herpesvirus genomes. J Comput Biol 3(3):345\u2013360","journal-title":"J Comput Biol"},{"key":"362_CR22","doi-asserted-by":"crossref","unstructured":"Liu Y, Li Z, Xiong H, Gao X, Wu J (2010) Understanding of internal clustering validation measures. In: 2010 IEEE 10th international conference on data mining (ICDM), IEEE, pp 911\u2013916","DOI":"10.1109\/ICDM.2010.35"},{"key":"362_CR23","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781107341005","volume-title":"Applied combinatorics on words","author":"M Lothaire","year":"2005","unstructured":"Lothaire M (2005) Applied combinatorics on words, vol 105. Cambridge University Press, Cambridge"},{"issue":"4","key":"362_CR24","doi-asserted-by":"publisher","first-page":"e36","DOI":"10.1371\/journal.pcbi.0020036","volume":"2","author":"KD MacIsaac","year":"2006","unstructured":"MacIsaac KD, Fraenkel E (2006) Practical strategies for discovering regulatory DNA sequence motifs. PLoS Comput Biol 2(4):e36","journal-title":"PLoS Comput Biol"},{"issue":"3","key":"362_CR25","doi-asserted-by":"publisher","first-page":"949","DOI":"10.1093\/nar\/gkh246","volume":"32","author":"L Marino-Ramrez","year":"2004","unstructured":"Marino-Ramrez L, Spouge JL, Kanga GC, Landsman D (2004) Statistical analysis of over-represented words in human promoter sequences. Nucl Acids Res 32(3):949\u2013958","journal-title":"Nucl Acids Res"},{"issue":"2","key":"362_CR26","doi-asserted-by":"publisher","first-page":"159","DOI":"10.1007\/BF02294245","volume":"50","author":"GW Milligan","year":"1985","unstructured":"Milligan GW, Cooper MC (1985) An examination of procedures for determining the number of clusters in a data set. Psychometrika 50(2):159\u2013179","journal-title":"Psychometrika"},{"issue":"4","key":"362_CR27","doi-asserted-by":"publisher","first-page":"441","DOI":"10.1207\/s15327906mbr2104_5","volume":"21","author":"GW Milligan","year":"1986","unstructured":"Milligan GW, Cooper MC (1986) A study of the comparability of external criteria for hierarchical cluster analysis. Multivar Behav Res 21(4):441\u2013458. https:\/\/doi.org\/10.1207\/s15327906mbr2104_5 pMID: 26828221","journal-title":"Multivar Behav Res"},{"issue":"1","key":"362_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.gene.2008.11.001","volume":"432","author":"T Nakamoto","year":"2009","unstructured":"Nakamoto T (2009) Evolution and the universality of the mechanism of initiation of protein synthesis. Gene 432(1):1\u20136","journal-title":"Gene"},{"issue":"1","key":"362_CR29","doi-asserted-by":"publisher","first-page":"1","DOI":"10.2202\/1544-6115.1219","volume":"5","author":"G Nuel","year":"2006","unstructured":"Nuel G (2006) Numerical solutions for patterns statistics on markov chains. Stat Appl Genet Mol Biol 5(1):1\u20135","journal-title":"Stat Appl Genet Mol Biol"},{"key":"362_CR30","volume-title":"Mathematics of genome analysis","author":"JK Percus","year":"2002","unstructured":"Percus JK (2002) Mathematics of genome analysis, vol 17. Cambridge University Press, Cambridge"},{"issue":"1\u20133","key":"362_CR31","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1016\/S0166-218X(00)00195-5","volume":"104","author":"M R\u00e9gnier","year":"2000","unstructured":"R\u00e9gnier M (2000) A unified approach to word occurrence probabilities. Discrete Appl Math 104(1\u20133):259\u2013280","journal-title":"Discrete Appl Math"},{"issue":"1\u20132","key":"362_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1089\/10665270050081360","volume":"7","author":"G Reinert","year":"2000","unstructured":"Reinert G, Schbath S, Waterman MS (2000) Probabilistic and statistical properties of words: an overview. J Comput Biol 7(1\u20132):1\u201346","journal-title":"J Comput Biol"},{"issue":"1","key":"362_CR33","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1239\/jap\/1032374240","volume":"36","author":"S Robin","year":"1999","unstructured":"Robin S, Daudin JJ (1999) Exact distribution of word occurrences in a random sequence of letters. J Appl Probab 36(1):179\u2013193","journal-title":"J Appl Probab"},{"issue":"4","key":"362_CR34","doi-asserted-by":"publisher","first-page":"895","DOI":"10.1023\/A:1014633825822","volume":"53","author":"S Robin","year":"2001","unstructured":"Robin S, Daudin JJ (2001) Exact distribution of the distances between any occurrences of a set of words. Ann Inst Stat Math 53(4):895\u2013905","journal-title":"Ann Inst Stat Math"},{"issue":"6","key":"362_CR35","doi-asserted-by":"publisher","first-page":"761","DOI":"10.1089\/10665270260518254","volume":"9","author":"S Robin","year":"2002","unstructured":"Robin S, Daudin JJ, Richard H, Sagot MF, Schbath S (2002) Occurrence probability of structured motifs in random sequences. J Comput Biol 9(6):761\u2013773","journal-title":"J Comput Biol"},{"key":"362_CR36","volume-title":"DNA, words and models: statistics of exceptional words","author":"S Robin","year":"2005","unstructured":"Robin S, Rodolphe F, Schbath S (2005) DNA, words and models: statistics of exceptional words. Cambridge University Press, Cambridge"},{"key":"362_CR37","doi-asserted-by":"publisher","first-page":"871","DOI":"10.1080\/01621459.1984.10477105","volume":"79","author":"PJ Rousseeuw","year":"1984","unstructured":"Rousseeuw PJ (1984) Least median of squares regression. J Am Stat Assoc 79:871\u2013880","journal-title":"J Am Stat Assoc"},{"key":"362_CR38","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1016\/0377-0427(87)90125-7","volume":"20","author":"PJ Rousseeuw","year":"1987","unstructured":"Rousseeuw PJ (1987) Silhouettes: a graphical aid to the interpretation and validation of cluster analysis. J Comput Appl Math 20:53\u201365","journal-title":"J Comput Appl Math"},{"issue":"5","key":"362_CR39","doi-asserted-by":"publisher","first-page":"1412","DOI":"10.1073\/pnas.0510310103","volume":"103","author":"S Saxonov","year":"2006","unstructured":"Saxonov S, Berg P, Brutlag DL (2006) A genome-wide analysis of CpG dinucleotides in the human genome distinguishes two distinct classes of promoters. Proc Natl Acad Sci 103(5):1412\u20131417","journal-title":"Proc Natl Acad Sci"},{"key":"362_CR40","doi-asserted-by":"publisher","first-page":"666","DOI":"10.1214\/aoap\/1034801248","volume":"7","author":"V Stefanov","year":"1997","unstructured":"Stefanov V, Pakes AG (1997) Explicit distributional results in pattern formation. Ann Appl Probab 7:666\u2013678","journal-title":"Ann Appl Probab"},{"issue":"3","key":"362_CR41","doi-asserted-by":"publisher","first-page":"756","DOI":"10.1239\/jap\/1014842834","volume":"37","author":"VT Stefanov","year":"2000","unstructured":"Stefanov VT (2000) On some waiting time problems. J Appl Probab 37(3):756\u2013764","journal-title":"J Appl Probab"},{"issue":"4","key":"362_CR42","doi-asserted-by":"publisher","first-page":"881","DOI":"10.1239\/jap\/1067436088","volume":"40","author":"VT Stefanov","year":"2003","unstructured":"Stefanov VT (2003) The intersite distances between pattern occurrences in strings generated by general discrete-and continuous-time models: an algorithmic approach. J Appl Probab 40(4):881\u2013892","journal-title":"J Appl Probab"},{"issue":"2","key":"362_CR43","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1037\/met0000049","volume":"21","author":"D Steinley","year":"2016","unstructured":"Steinley D, Brusco MJ, Hubert L (2016) The variance of the adjusted rand index. Psychol Methods 21(2):261","journal-title":"Psychol Methods"},{"key":"362_CR44","doi-asserted-by":"crossref","unstructured":"Tavares AH, Afreixo V, Rodrigues JM, Bastos CAC (2015) The symmetry of oligonucleotide distance distributions in the human genome. In: ICPRAM (2), pp 256\u2013263","DOI":"10.5220\/0005223102560263"},{"key":"362_CR45","unstructured":"Tavares AH, Afreixo V, Rodrigues JM, Bastos CAC, Pinho AJ, Ferreira PJSG, Brito P (2016) Detection of exceptional genomic words: a comparison between species. In: Proceedings of 22nd international conference on computational statistics (COMPSTAT), pp 255\u2013264"},{"issue":"1","key":"362_CR46","doi-asserted-by":"publisher","first-page":"728","DOI":"10.1038\/s41598-017-00646-2","volume":"7","author":"AHMP Tavares","year":"2017","unstructured":"Tavares AHMP, Pinho AJ, Silva RM, Rodrigues JMOS, Bastos CAC, Ferreira PJSG, Afreixo V (2017) DNA word analysis based on the distribution of the distances between symmetric words. Sci Rep 7(1):728","journal-title":"Sci Rep"}],"container-title":["Advances in Data Analysis and Classification"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11634-019-00362-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11634-019-00362-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11634-019-00362-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,19]],"date-time":"2022-09-19T01:19:34Z","timestamp":1663550374000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11634-019-00362-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,5,31]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2020,3]]}},"alternative-id":["362"],"URL":"https:\/\/doi.org\/10.1007\/s11634-019-00362-x","relation":{},"ISSN":["1862-5347","1862-5355"],"issn-type":[{"type":"print","value":"1862-5347"},{"type":"electronic","value":"1862-5355"}],"subject":[],"published":{"date-parts":[[2019,5,31]]},"assertion":[{"value":"16 November 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 May 2019","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 May 2019","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 May 2019","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}