{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T11:14:40Z","timestamp":1761909280441,"version":"build-2065373602"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T00:00:00Z","timestamp":1761523200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T00:00:00Z","timestamp":1761523200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/100009112","name":"Istituto Nazionale di Alta Matematica \u201cFrancesco Severi\u201d","doi-asserted-by":"publisher","award":["E53C23001670001","E53C23001670001","E53C23001670001"],"award-info":[{"award-number":["E53C23001670001","E53C23001670001","E53C23001670001"]}],"id":[{"id":"10.13039\/100009112","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100017142","name":"Gruppo Nazionale per il Calcolo Scientifico","doi-asserted-by":"publisher","award":["E53C24001950001","E53C24001950001","E53C24001950001"],"award-info":[{"award-number":["E53C24001950001","E53C24001950001","E53C24001950001"]}],"id":[{"id":"10.13039\/100017142","id-type":"DOI","asserted-by":"publisher"}]},{"name":"European Union under the Italian National Recovery and Resilience Plan","award":["E83C22004640001","E83C22004640001"],"award-info":[{"award-number":["E83C22004640001","E83C22004640001"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["BMC Bioinformatics"],"DOI":"10.1186\/s12859-025-06261-7","type":"journal-article","created":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T12:29:49Z","timestamp":1761568189000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["BioSet2Vec: extraction of k-mer dictionaries from multiple sets of biological sequences via big data technologies"],"prefix":"10.1186","volume":"26","author":[{"given":"Ylenia","family":"Galluzzo","sequence":"first","affiliation":[]},{"given":"Raffaele","family":"Giancarlo","sequence":"additional","affiliation":[]},{"given":"Simona E.","family":"Rombo","sequence":"additional","affiliation":[]},{"given":"Filippo","family":"Utro","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"issue":"3","key":"6261_CR1","doi-asserted-by":"publisher","first-page":"390","DOI":"10.1093\/bib\/bbt088","volume":"15","author":"R Giancarlo","year":"2014","unstructured":"Giancarlo R, Rombo SE, Utro F. Compressive biological sequence analysis and archival in the era of high-throughput sequencing technologies. Brief Bioinform. 2014;15(3):390\u2013406.","journal-title":"Brief Bioinform"},{"issue":"11","key":"6261_CR2","doi-asserted-by":"publisher","first-page":"000685","DOI":"10.1099\/mgen.0.000685","volume":"7","author":"O Schwengers","year":"2021","unstructured":"Schwengers O, Jelonek L, Dieckmann MA, Beyvers S, Blom J, Goesmann A. Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification. Microb Genomics. 2021;7(11):000685.","journal-title":"Microb Genomics"},{"issue":"4","key":"6261_CR3","doi-asserted-by":"publisher","first-page":"925","DOI":"10.1093\/BIOINFORMATICS\/BTAB747","volume":"38","author":"G Cattaneo","year":"2022","unstructured":"Cattaneo G, Petrillo UF, Giancarlo R, Palini F, Romualdi C. The power of word-frequency-based alignment-free functions: a comprehensive large-scale experimental analysis. Bioinformatics. 2022;38(4):925\u201332. https:\/\/doi.org\/10.1093\/BIOINFORMATICS\/BTAB747.","journal-title":"Bioinformatics"},{"issue":"1","key":"6261_CR4","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1186\/s13015-024-00251-9","volume":"19","author":"J Fan","year":"2024","unstructured":"Fan J, Khan J, Singh NP, Pibiri GE, Patro R. Fulgor: a fast and compact k-mer index for large-scale matching and color queries. Algorithms Mol Biol. 2024;19(1):3.","journal-title":"Algorithms Mol Biol"},{"issue":"1","key":"6261_CR5","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1186\/s13015-023-00226-2","volume":"18","author":"GE Pibiri","year":"2023","unstructured":"Pibiri GE. On weighted k-mer dictionaries. Algorithms Mol Biol. 2023;18(1):3.","journal-title":"Algorithms Mol Biol"},{"issue":"20","key":"6261_CR6","doi-asserted-by":"publisher","first-page":"3454","DOI":"10.1093\/bioinformatics\/bty799","volume":"34","author":"R Giancarlo","year":"2018","unstructured":"Giancarlo R, Rombo SE, Utro F. In vitro versus in vivo compositional landscapes of histone sequence preferences in Eucaryotic genomes. Bioinformatics. 2018;34(20):3454\u201360. https:\/\/doi.org\/10.1093\/bioinformatics\/bty799.","journal-title":"Bioinformatics"},{"key":"6261_CR7","doi-asserted-by":"publisher","first-page":"107379","DOI":"10.1016\/j.compbiolchem.2020.107379","volume":"89","author":"Z Du","year":"2020","unstructured":"Du Z, He Y, Li J, Uversky VN. Deepadd: protein function prediction from k-mer embedding and additional features. Comput Biol Chem. 2020;89:107379.","journal-title":"Comput Biol Chem"},{"key":"6261_CR8","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1016\/j.ymeth.2023.02.009","volume":"212","author":"Z Teng","year":"2023","unstructured":"Teng Z, Shi L, Yu H, Wu C, Tian Z. Measuring functional similarity of lncrnas based on variable k-mer profiles of nucleotide sequences. Methods. 2023;212:21\u201330.","journal-title":"Methods"},{"issue":"19","key":"6261_CR9","doi-asserted-by":"publisher","first-page":"21920","DOI":"10.1007\/s10489-023-04618-0","volume":"53","author":"MS Nawaz","year":"2023","unstructured":"Nawaz MS, Fournier-Viger P, Aslam M, Li W, He Y, Niu X. Using alignment-free and pattern mining methods for SARS-CoV-2 genome analysis. Appl Intell. 2023;53(19):21920\u201343.","journal-title":"Appl Intell"},{"issue":"18","key":"6261_CR10","doi-asserted-by":"publisher","first-page":"2939","DOI":"10.1093\/bioinformatics\/btv295","volume":"31","author":"R Giancarlo","year":"2015","unstructured":"Giancarlo R, Rombo SE, Utro F. Epigenomic k-mer dictionaries: shedding light on how sequence composition influences in vivo nucleosome positioning. Bioinformatics. 2015;31(18):2939\u201346. https:\/\/doi.org\/10.1093\/bioinformatics\/btv295.","journal-title":"Bioinformatics"},{"key":"6261_CR11","first-page":"1","volume-title":"Data mining","author":"A Rajaraman","year":"2011","unstructured":"Rajaraman A, Ullman JD. Data mining. Cambridge: Cambridge University Press; 2011. p. 1\u201317."},{"key":"6261_CR12","unstructured":"Zaharia M, Chowdhury M, Das T, Dave A, Ma J, McCauly M, Franklin MJ, Shenker S, Stoica I. Resilient distributed datasets: a fault-tolerant abstraction for in-memory cluster computing. In: Presented as part of the 9th USENIX symposium on networked systems design and implementation (NSDI 12), San Jose, 2012; pp. 15\u201328."},{"issue":"7339","key":"6261_CR13","doi-asserted-by":"publisher","first-page":"480","DOI":"10.1038\/nature09725","volume":"471","author":"PV Kharchenko","year":"2011","unstructured":"Kharchenko PV, Alekseyenko AA, Schwartz YB, Minoda A, Riddle NC, Ernst J, et al. Comprehensive analysis of the chromatin landscape in Drosophila melanogaster. Nature. 2011;471(7339):480\u20135.","journal-title":"Nature"},{"key":"6261_CR14","unstructured":"Steen M, Tanenbaum A (2017) Distributed systems. Maarten van Steen Leiden. The Netherlands, 2017; pp. 206\u2013210."},{"key":"6261_CR15","unstructured":"Zaharia M, Chowdhury M, Franklin MJ, Shenker S, Stoica I. Spark: cluster computing with working sets. In: 2nd USENIX workshop on hot topics in cloud computing (HotCloud 10), 2010."},{"key":"6261_CR16","unstructured":"Zaharia M, Chowdhury M, Das T, Dave A, Ma J, McCauly M, Franklin MJ, Shenker S, Stoica I. Resilient distributed datasets: a {Fault-Tolerant} abstraction for {In-Memory} cluster computing. In: 9th USENIX symposium on networked systems design and implementation (NSDI 12), 2012; pp. 5\u201328."},{"key":"6261_CR17","doi-asserted-by":"crossref","unstructured":"Armbrust M, Xin RS, Lian C, Huai Y, Liu D, Bradley JK, Meng X, Kaftan T, Franklin MJ, Ghodsi A, et al. Spark sql: relational data processing in spark. In: Proceedings of the 2015 ACM SIGMOD international conference on management of data, 2015; pp. 1383\u20131394.","DOI":"10.1145\/2723372.2742797"},{"issue":"1\u201313","key":"6261_CR18","first-page":"2","volume":"53","author":"D Borthakur","year":"2008","unstructured":"Borthakur D, et al. Hdfs architecture guide. Hadoop Apache Project. 2008;53(1\u201313):2.","journal-title":"Hadoop Apache Project"},{"key":"6261_CR19","doi-asserted-by":"crossref","unstructured":"Vohra D, Vohra D. Apache parquet. Practical Hadoop Ecosystem: A Definitive Guide to Hadoop-Related Frameworks and Tools, 2016; pp. 325\u2013335.","DOI":"10.1007\/978-1-4842-2199-0_8"},{"key":"6261_CR20","unstructured":"Ramos J, et al. Using tf-idf to determine word relevance in document queries. In: Proceedings of the first instructional conference on machine learning. Citeseer. 2003; pp. 29\u201348."},{"key":"6261_CR21","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/978-3-642-79999-0_3","volume-title":"From data to knowledge: theoretical and practical aspects of classification, data analysis, and knowledge organization","author":"AD Gordon","year":"1996","unstructured":"Gordon AD. Null models in cluster validation. In: From data to knowledge: theoretical and practical aspects of classification, data analysis, and knowledge organization. Berlin: Springer; 1996. p. 32\u201344."},{"key":"6261_CR22","volume-title":"Data mining: concepts and techniques","author":"J Han","year":"2022","unstructured":"Han J, Pei J, Tong H. Data mining: concepts and techniques. Burlington: Morgan Kaufmann; 2022."},{"key":"6261_CR23","doi-asserted-by":"publisher","first-page":"028","DOI":"10.1093\/database\/bav028","volume":"2015","author":"J Pi\u00f1ero","year":"2015","unstructured":"Pi\u00f1ero J, Queralt-Rosinach N, Bravo A, Deu-Pons J, Bauer-Mehren A, Baron M, et al. Disgenet: a discovery platform for the dynamical exploration of human diseases and their genes. Database. 2015;2015:028.","journal-title":"Database"},{"issue":"3","key":"6261_CR24","doi-asserted-by":"publisher","first-page":"1737","DOI":"10.1007\/s12035-018-1173-y","volume":"56","author":"A Uddin","year":"2019","unstructured":"Uddin A, Chakraborty S. Codon usage pattern of genes involved in central nervous system. Mol Neurobiol. 2019;56(3):1737\u201348.","journal-title":"Mol Neurobiol"},{"key":"6261_CR25","doi-asserted-by":"publisher","first-page":"2100477","DOI":"10.1200\/PO.21.00477","volume":"6","author":"RW Madison","year":"2022","unstructured":"Madison RW, Hu X, Ramanan V, Xu Z, Huang RS, Sokol ES, et al. Clustered 8-oxo-guanine mutations and oncogenic gene fusions in microsatellite-unstable colorectal cancer. JCO Precis Oncol. 2022;6:2100477.","journal-title":"JCO Precis Oncol"},{"issue":"22","key":"6261_CR26","doi-asserted-by":"publisher","first-page":"3815","DOI":"10.1126\/sciadv.abn3815","volume":"8","author":"S-G Jin","year":"2022","unstructured":"Jin S-G, Meng Y, Johnson J, Szab\u00f3 PE, Pfeifer GP. Concordance of hydrogen peroxide-induced 8-oxo-guanine patterns with two cancer mutation signatures of upper GI tract tumors. Sci Adv. 2022;8(22):3815.","journal-title":"Sci Adv"},{"issue":"9","key":"6261_CR27","doi-asserted-by":"publisher","first-page":"551","DOI":"10.1038\/nrg.2016.83","volume":"17","author":"H Zhu","year":"2016","unstructured":"Zhu H, Wang G, Qian J. Transcription factors as readers and effectors of DNA methylation. Nat Rev Genet. 2016;17(9):551\u201365.","journal-title":"Nat Rev Genet"},{"key":"6261_CR28","doi-asserted-by":"crossref","unstructured":"McInnes, L., Healy, J., Melville, J.: Umap: uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426 (2018)","DOI":"10.21105\/joss.00861"},{"key":"6261_CR29","unstructured":"Angelov D. Top2vec: distributed representations of topics. arXiv preprint arXiv:2008.09470 (2020)."},{"issue":"7","key":"6261_CR30","doi-asserted-by":"publisher","first-page":"75","DOI":"10.1093\/nar\/gkt003","volume":"41","author":"H Yi","year":"2013","unstructured":"Yi H, Jin L. Co-phylog: an assembly-free phylogenomic approach for closely related organisms. Nucleic Acids Res. 2013;41(7):75.","journal-title":"Nucleic Acids Res"},{"key":"6261_CR31","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13059-019-1755-7","volume":"20","author":"A Zielezinski","year":"2019","unstructured":"Zielezinski A, Girgis HZ, Bernard G, Leimeister C-A, Tang K, Dencker T, et al. Benchmarking of alignment-free sequence comparison methods. Genome Biol. 2019;20:1\u201318.","journal-title":"Genome Biol"},{"key":"6261_CR32","doi-asserted-by":"publisher","first-page":"192","DOI":"10.3389\/fpls.2012.00192","volume":"3","author":"K Hatje","year":"2012","unstructured":"Hatje K, Kollmar M. A phylogenetic analysis of the brassicales clade based on an alignment-free sequence comparison method. Front Plant Sci. 2012;3:192.","journal-title":"Front Plant Sci"}],"container-title":["BMC Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s12859-025-06261-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s12859-025-06261-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s12859-025-06261-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T11:12:05Z","timestamp":1761909125000},"score":1,"resource":{"primary":{"URL":"https:\/\/bmcbioinformatics.biomedcentral.com\/articles\/10.1186\/s12859-025-06261-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":32,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["6261"],"URL":"https:\/\/doi.org\/10.1186\/s12859-025-06261-7","relation":{},"ISSN":["1471-2105"],"issn-type":[{"type":"electronic","value":"1471-2105"}],"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"8 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 October 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"RG and SER are members of the Editorial Board of BMC Bioinformatics.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"264"}}