{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,20]],"date-time":"2026-04-20T18:23:03Z","timestamp":1776709383934,"version":"3.51.2"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2023,7,3]],"date-time":"2023-07-03T00:00:00Z","timestamp":1688342400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,7,3]],"date-time":"2023-07-03T00:00:00Z","timestamp":1688342400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Med Biol Eng Comput"],"published-print":{"date-parts":[[2023,10]]},"DOI":"10.1007\/s11517-023-02837-8","type":"journal-article","created":{"date-parts":[[2023,7,3]],"date-time":"2023-07-03T05:01:20Z","timestamp":1688360480000},"page":"2607-2626","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["ViralVectors: compact and scalable alignment-free virome feature generation"],"prefix":"10.1007","volume":"61","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8121-2168","authenticated-orcid":false,"given":"Sarwan","family":"Ali","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Prakash","family":"Chourasia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zahra","family":"Tayebi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Babatunde","family":"Bello","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Murray","family":"Patterson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,7,3]]},"reference":[{"key":"2837_CR1","doi-asserted-by":"crossref","unstructured":"Ali S, Ali TE, Khan MA, Khan I, Patterson M (2021) Effective and scalable clustering of SARS-COV-2 sequences. In: International conference on big data research (ICBDR). pp 42\u201349","DOI":"10.1145\/3505745.3505752"},{"issue":"3","key":"2837_CR2","doi-asserted-by":"publisher","first-page":"418","DOI":"10.3390\/biology11030418","volume":"11","author":"S Ali","year":"2022","unstructured":"Ali S, Bello B, Chourasia P, Punathil RT, Zhou Y, Patterson M (2022) PWM2VEC: An efficient embedding approach for viral host specification from coronavirus spike sequences. Biology 11(3):418","journal-title":"Biology"},{"key":"2837_CR3","doi-asserted-by":"crossref","unstructured":"Ali S, Patterson M (2021) Spike2Vec: An efficient and scalable embedding approach for COVID-19 spike sequences. In: IEEE international conference on big data (Big Data). pp 1533\u20131540","DOI":"10.1109\/BigData52589.2021.9671848"},{"key":"2837_CR4","doi-asserted-by":"crossref","unstructured":"Ali S, Sahoo B, Ullah N, Zelikovskiy A, Patterson M, Khan I (2021) A k-mer based approach for SARS-COV-2 variant identification. In: International symposium on bioinformatics research and applications. pp 153\u2013164","DOI":"10.1007\/978-3-030-91415-8_14"},{"issue":"1","key":"2837_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1080\/03610927408827101","volume":"3","author":"T Cali\u0144ski","year":"1974","unstructured":"Cali\u0144ski T, Harabasz J (1974) A dendrite method for cluster analysis. Comm Stats-theory Methods 3(1):1\u201327","journal-title":"Comm Stats-theory Methods"},{"issue":"11","key":"2837_CR6","doi-asserted-by":"publisher","first-page":"987","DOI":"10.1038\/nbt.2023","volume":"29","author":"PEC Compeau","year":"2011","unstructured":"Compeau PEC, Pevzner PA, Tesler G (2011) How to apply de Bruijn graphs to genome assembly. Nat Biotechnol 29(11):987\u2013991","journal-title":"Nat Biotechnol"},{"key":"2837_CR7","doi-asserted-by":"publisher","first-page":"224","DOI":"10.1109\/TPAMI.1979.4766909","volume":"2","author":"DL Davies","year":"1979","unstructured":"Davies DL, Bouldin DW (1979) A cluster separation measure. IEEE Trans Pattern Anal Mach Intell 2:224\u2013227","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2837_CR8","doi-asserted-by":"crossref","unstructured":"De\u00a0Silva NH, Bhai J, Chakiachvili M, Contreras-Moreira B, Cummins C, Frankish A, Gall, A, Genez T, Howe KL, Hunt SE, et\u00a0al (2021) The Ensembl COVID-19 resource: Ongoing integration of public SARS-COV-2 data. bioRxiv pp 2020\u201312","DOI":"10.1101\/2020.12.18.422865"},{"key":"2837_CR9","unstructured":"Devijver P, Kittler J (1982) Pattern recognition: A statistical approach. In: London, GB: Prentice-Hall. pp 1\u2013448"},{"issue":"10","key":"2837_CR10","doi-asserted-by":"publisher","first-page":"958","DOI":"10.1016\/j.cels.2021.08.009","volume":"12","author":"B Ekim","year":"2021","unstructured":"Ekim B, Berger B, Chikhi R (2021) Minimizer-space de Bruijn graphs: Whole-genome assembly of long reads in min on a PC. Cell Syst 12(10):958-968.e6","journal-title":"Cell Syst"},{"issue":"1","key":"2837_CR11","first-page":"1","volume":"21","author":"H ElAbd","year":"2020","unstructured":"ElAbd H, Bromberg Y, Hoarfrost A, Lenz T, Franke A, Wendorff M (2020) Amino acid encoding for deep learning applications. Bioinformatics 21(1):1\u201314","journal-title":"Bioinformatics"},{"key":"2837_CR12","unstructured":"Farhan M, Tariq J, Zaman A, Shabbir M, Khan I (2017) Efficient approx algorithms for strings kernel based sequence classification. In: Advances in neural info processing sys (NeurIPS). pp 6935\u20136945"},{"key":"2837_CR13","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1038\/nrg.2017.88","volume":"19","author":"J Gardy","year":"2018","unstructured":"Gardy J, Loman N (2018) Towards a genomics-informed, real-time, global pathogen surveillance system. Nat Rev Genet 19:9\u201320","journal-title":"Nat Rev Genet"},{"key":"2837_CR14","unstructured":"GISAID Website: https:\/\/www.gisaid.org\/. Accessed 5 Jan 2022"},{"key":"2837_CR15","doi-asserted-by":"publisher","first-page":"4121","DOI":"10.1093\/bioinformatics\/bty407","volume":"34","author":"J Hadfield","year":"2018","unstructured":"Hadfield J, Megill C, Bell S, Huddleston J, Potter B, Callender C, Sagulenko P, Bedford T, Neher R (2018) Nextstrain: real-time tracking of pathogen evo. Bioinformatics 34:4121\u20134123","journal-title":"Bioinformatics"},{"issue":"2","key":"2837_CR16","first-page":"105","volume":"4","author":"AE Hoerl","year":"1975","unstructured":"Hoerl AE, Kannard RW, Baldwin KF (1975) Ridge regression: some simulations. Comm Stat-Theory Methods 4(2):105-123","journal-title":"Comm Stat-Theory Methods"},{"issue":"D1","key":"2837_CR17","doi-asserted-by":"publisher","first-page":"D689","DOI":"10.1093\/nar\/gkz890","volume":"48","author":"KL Howe","year":"2020","unstructured":"Howe KL, Contreras-Moreira B, De Silva N, Maslen G, Akanni W, Allen J, Alvarez-Jarreta J, Barba M, Bolser DM, Cambell L et al (2020) Ensembl genomes 2020\u2013enabling non-vertebrate genomic research. Nucleic Acids Res 48(D1):D689\u2013D695","journal-title":"Nucleic Acids Res"},{"issue":"2","key":"2837_CR18","doi-asserted-by":"publisher","first-page":"195","DOI":"10.1006\/jmbi.1999.3091","volume":"292","author":"DT Jones","year":"1999","unstructured":"Jones DT (1999) Protein secondary structure prediction based on position-specific scoring matrices. J Mol Biol 292(2):195-202","journal-title":"J Mol Biol"},{"key":"2837_CR19","doi-asserted-by":"publisher","first-page":"D689","DOI":"10.1093\/nar\/gkz890","volume":"48","author":"KL Howe","year":"2020","unstructured":"Howe KL, Contreras-Moreira B, De Silva N, Maslen G, Akanni W, Allen J, Alvarez-Jarreta J, Barba M, Bolser MM, Cambell L et al (2020) Ensembl genomes 2020\u2013enabling non-vertebrate genomic research. Nucleic Acids Res 48:D689\u2013D695","journal-title":"Nucleic Acids Res"},{"key":"2837_CR20","doi-asserted-by":"publisher","first-page":"D884","DOI":"10.1093\/nar\/gkaa942","volume":"49","author":"KL Howe","year":"2021","unstructured":"Howe KL, Achuthan P, Allen J, Allen J, Alvarez-Jarreta J, Amode MR, Armean IM, Azov AG, Bennett R, Bhai J, Billis K (2021) Ensembl 2021. Nucleic Acids Res 49:D884\u2013D891","journal-title":"Nucleic Acids Res"},{"issue":"3","key":"2837_CR21","doi-asserted-by":"publisher","first-page":"553","DOI":"10.1016\/j.bbrc.2020.09.010","volume":"533","author":"K Kuzmin","year":"2020","unstructured":"Kuzmin K, Adeniyi AE, DaSouza AK Jr, Lim D, Nguyen H, Molina NR, Xiong L, Weber IT, Harrison RW (2020) Machine learning methods accurately predict host specificity of coronaviruses based on spike sequences alone. Biochem Biophys Res Commun 533(3):553\u2013558","journal-title":"Biochem Biophys Res Commun"},{"issue":"4","key":"2837_CR22","doi-asserted-by":"publisher","first-page":"467","DOI":"10.1093\/bioinformatics\/btg431","volume":"20","author":"CS Leslie","year":"2004","unstructured":"Leslie CS, Eskin E, Cohen A, Weston J, Noble WS (2004) Mismatch string kernels for discriminative protein classification. Bioinformatics 20(4):467\u2013476","journal-title":"Bioinformatics"},{"key":"2837_CR23","doi-asserted-by":"crossref","unstructured":"Li H (2016) Minimap and miniasm: fast mapping and de novo assembly for noisy long sequences. Bioinformatics 32:2103-2110","DOI":"10.1093\/bioinformatics\/btw152"},{"key":"2837_CR24","unstructured":"Lundberg SM, Lee SI (2017) A unified approach to interpreting model predictions. In: Guyon I, Luxburg UV, Bengio S, Wallach H, Fergus R, Vishwanathan S, Garnett R (eds) Advances in neural information processing systems, vol 30. pp 4765\u20134774"},{"key":"2837_CR25","doi-asserted-by":"publisher","first-page":"i13","DOI":"10.1093\/bioinformatics\/bty258","volume":"34","author":"G Mar\u00e7ais","year":"2018","unstructured":"Mar\u00e7ais G, DeBlasio D, Kingsford C (2018) Asymptotically optimal minimizers schemes. Bioinformatics 34:i13\u2013i22","journal-title":"Bioinformatics"},{"issue":"6","key":"2837_CR26","first-page":"775","volume":"80","author":"H Mei","year":"2005","unstructured":"Mei H, Liao ZH, Zhou Y, Li SZ (2005) A new set of amino acid descriptors and its application in peptide QSARs. Peptide Sci Original Res Biomol 80(6):775\u2013786","journal-title":"Peptide Sci Original Res Biomol"},{"key":"2837_CR27","doi-asserted-by":"crossref","unstructured":"M\u00f6lder F, Jab, K, Letcher B, et\u00a0al (2021) Sustainable data analysis with snakemake. F1000Res 10(33)","DOI":"10.12688\/f1000research.29032.1"},{"key":"2837_CR28","doi-asserted-by":"crossref","unstructured":"Ondov B, Treangen T, Melsted P, et\u00a0al (2016) Mash: fast genome and metagenome distance estimation using MinHash. Genome Biol 17(132)","DOI":"10.1186\/s13059-016-0997-x"},{"key":"2837_CR29","unstructured":"Phylogenetic assignment of named global outbreak LINeages (Pangolin): https:\/\/cov-lineages.org\/resources\/pangolin.html. Accessed 4 Jan 2022"},{"issue":"D1","key":"2837_CR30","doi-asserted-by":"publisher","first-page":"D593","DOI":"10.1093\/nar\/gkr859","volume":"40","author":"BE Pickett","year":"2012","unstructured":"Pickett BE, Sadat EL, Zhang Y, Noronha JM, Squires RB, Hunt V, Liu M, Kumar S, Zaremba S, Gu Z et al (2012) ViPR: an open bioinformatics database and analysis resource for virology research. Nucleic Acids Res 40(D1):D593\u2013D598","journal-title":"Nucleic Acids Res"},{"key":"2837_CR31","unstructured":"Rahimi A, Recht B, et\u00a0al (2007) Random features for large-scale kernel machines. In: NIPS, vol 3. p\u00a05"},{"key":"2837_CR32","doi-asserted-by":"publisher","first-page":"3363","DOI":"10.1093\/bioinformatics\/bth408","volume":"20","author":"M Roberts","year":"2004","unstructured":"Roberts M, Haynes W, Hunt B, Mount S, Yorke J (2004) Reducing storage requirements for biological sequence comparison. Bioinformatics 20:3363\u20139","journal-title":"Bioinformatics"},{"key":"2837_CR33","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1016\/0377-0427(87)90125-7","volume":"20","author":"PJ Rousseeuw","year":"1987","unstructured":"Rousseeuw PJ (1987) Silhouettes: a graphical aid to the interpretation and validation of cluster analysis. J Comput Appl Math 20:53\u201365","journal-title":"J Comput Appl Math"},{"issue":"10","key":"2837_CR34","doi-asserted-by":"publisher","first-page":"1569","DOI":"10.1093\/bioinformatics\/btv022","volume":"31","author":"S Deorowicz","year":"2015","unstructured":"Deorowicz S, Kokot M, Grabowski S, Debudaj-Grabysz A (2015) KMC 2: fast and resource-frugal k-mer counting. Bioinformatics 31(10):1569\u201376","journal-title":"Bioinformatics"},{"key":"2837_CR35","unstructured":"Silva NHD, Bhai J, Chakiachvili M, et\u00a0al (2021) The ensembl COVID-19 resource: ongoing integration of public SARS-COV-2 data. Nucleic Acids Research"},{"key":"2837_CR36","doi-asserted-by":"crossref","unstructured":"Solis-Reyes S, Avino M, Poon A, Kari L (2018) An open-source k-mer based machine learning tool for fast and accurate subtyping of HIV-1 genomes. PloS ONE","DOI":"10.1101\/362780"},{"issue":"9","key":"2837_CR37","doi-asserted-by":"publisher","first-page":"2997","DOI":"10.1093\/nar\/10.9.2997","volume":"10","author":"GD Stormo","year":"1982","unstructured":"Stormo GD, Schneider TD, Gold L, Ehrenfeucht A (1982) Use of the \u2018Perceptron\u2019 algorithm to distinguish translational initiation sites in E. coli. Nucleic Acids Res 10(9):2997\u20133011","journal-title":"Nucleic Acids Res"},{"issue":"1","key":"2837_CR38","doi-asserted-by":"crossref","first-page":"267","DOI":"10.1111\/j.2517-6161.1996.tb02080.x","volume":"58","author":"R Tibshirani","year":"1996","unstructured":"Tibshirani R (1996) Regression shrinkage and selection via the lasso. J Roy Stat Soc: Ser B (Methodol) 58(1):267\u2013288","journal-title":"J Roy Stat Soc: Ser B (Methodol)"},{"issue":"8","key":"2837_CR39","first-page":"1","volume":"11","author":"NC Toussaint","year":"2010","unstructured":"Toussaint NC, Widmer C, Kohlbacher O, R\u00e4tsch G (2010) Exploiting physico-chemical properties in string kernels. BMC Bioinforma 11(8):1\u20139","journal-title":"BMC Bioinforma"},{"key":"2837_CR40","unstructured":"Van DML, Hinton G (2008) Visualizing data using t-SNE. J Mach Learn Res (JMLR) 9(11)"},{"issue":"1\u20133","key":"2837_CR41","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1016\/0169-7439(87)80084-9","volume":"2","author":"S Wold","year":"1987","unstructured":"Wold S, Esbensen K, Geladi P (1987) Principal component analysis. Chemom Intell Lab Syst 2(1\u20133):37\u201352","journal-title":"Chemom Intell Lab Syst"},{"key":"2837_CR42","doi-asserted-by":"crossref","unstructured":"Wood D, Salzberg S (2014) Kraken: ultrafast metagenomic sequence classification using exact alignments. Genome Biol 15","DOI":"10.1186\/gb-2014-15-3-r46"},{"issue":"3","key":"2837_CR43","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/gb-2014-15-3-r46","volume":"15","author":"DE Wood","year":"2014","unstructured":"Wood DE, Salzberg SL (2014) Kraken: ultrafast metagenomic sequence classification using exact alignments. Genome Biol 15(3):1\u201312","journal-title":"Genome Biol"},{"issue":"7798","key":"2837_CR44","doi-asserted-by":"publisher","first-page":"265","DOI":"10.1038\/s41586-020-2008-3","volume":"579","author":"F Wu","year":"2020","unstructured":"Wu F, Zhao S, Yu B, Chen YM, Wang W, Song ZG, Hu Y, Tao ZW, Tian JH, Pei YY et al (2020) A new coronavirus associate with human respiratory disease. Nature 579(7798):265\u2013269","journal-title":"Nature"},{"issue":"7","key":"2837_CR45","doi-asserted-by":"publisher","first-page":"e1002195","DOI":"10.1371\/journal.pbio.1002195","volume":"13","author":"ZD Stephens","year":"2015","unstructured":"Stephens ZD, Lee SY, Faghri F, Campbell RH, Zhai C, Efron MJ et al (2015) Big data: Astronomical or genomical? PLoS Biol 13(7):e1002195","journal-title":"PLoS Biol"},{"key":"2837_CR46","doi-asserted-by":"crossref","unstructured":"Zheng H, Kingsford C, Mar\u00e7ais G (2020) Lower density selection schemes via small universal hitting sets with short remaining path len. In: ICRCMB. Springer, pp 202\u2013217","DOI":"10.1007\/978-3-030-45257-5_13"}],"container-title":["Medical &amp; Biological Engineering &amp; Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11517-023-02837-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11517-023-02837-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11517-023-02837-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T14:15:55Z","timestamp":1729692955000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11517-023-02837-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,3]]},"references-count":46,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2023,10]]}},"alternative-id":["2837"],"URL":"https:\/\/doi.org\/10.1007\/s11517-023-02837-8","relation":{},"ISSN":["0140-0118","1741-0444"],"issn-type":[{"value":"0140-0118","type":"print"},{"value":"1741-0444","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,7,3]]},"assertion":[{"value":"23 June 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 March 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 July 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}