{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T07:34:51Z","timestamp":1769844891824,"version":"3.49.0"},"reference-count":14,"publisher":"Oxford University Press (OUP)","issue":"5","license":[{"start":{"date-parts":[[2025,4,26]],"date-time":"2025-04-26T00:00:00Z","timestamp":1745625600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,6]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Summary<\/jats:title>\n                  <jats:p>We present RabbitSketch, a highly optimized library of sketching algorithms such as MinHash, OrderMinHash, and HyperLogLog that can exploit the power of modern multi-core CPUs. It provides significant speedups compared to existing implementations, ranging from 2.30\u00d7 to 49.55\u00d7, as well as flexible and easy-to-use interfaces for both Python and C++. As a result, the similarity analysis of 455GB genomic data can be completed in only 5\u2009minutes using RabbitSketch with merely 20 lines of Python code. As a case study, we enhanced RabbitTClust by integrating RabbitSketch\u2019s Kssd algorithm, resulting in a 1.54\u00d7 speedup with no loss in accuracy.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>RabbitSketch is available at https:\/\/github.com\/RabbitBio\/RabbitSketch with an archived version at Zenodo: https:\/\/doi.org\/10.5281\/zenodo.14903962. Detailed API documentation is available at https:\/\/rabbitsketch.readthedocs.io\/en\/latest.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btaf249","type":"journal-article","created":{"date-parts":[[2025,4,26]],"date-time":"2025-04-26T17:08:00Z","timestamp":1745687280000},"source":"Crossref","is-referenced-by-count":2,"title":["RabbitSketch: a high-performance sketching library for genome analysis"],"prefix":"10.1093","volume":"41","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9723-5645","authenticated-orcid":false,"given":"Tong","family":"Zhang","sequence":"first","affiliation":[{"name":"School of Software, Shandong University , Jinan 250101,","place":["China"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zekun","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University , Jinan 250101,","place":["China"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0370-2222","authenticated-orcid":false,"given":"Xiaoming","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University , Jinan 250101,","place":["China"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9610-1268","authenticated-orcid":false,"given":"Lifeng","family":"Yan","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University , Jinan 250101,","place":["China"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fangjin","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University , Jinan 250101,","place":["China"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaohui","family":"Duan","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University , Jinan 250101,","place":["China"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2597-8331","authenticated-orcid":false,"given":"Bertil","family":"Schmidt","sequence":"additional","affiliation":[{"name":"Institute for Computer Science, Johannes Gutenberg University , Mainz 55128,","place":["Germany"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weiguo","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University , Jinan 250101,","place":["China"]}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"286","published-online":{"date-parts":[[2025,4,26]]},"reference":[{"key":"2025051307513050100_btaf249-B1","doi-asserted-by":"crossref","first-page":"265","DOI":"10.1186\/s13059-019-1875-0","article-title":"Dashing: fast and accurate genomic distances with hyperloglog","volume":"20","author":"Baker","year":"2019","journal-title":"Genome Biol"},{"key":"2025051307513050100_btaf249-B2","first-page":"1218","article-title":"Genomic sketching with multiplicities and locality-sensitive hashing using dashing 2","volume":"33","author":"Baker","year":"2023","journal-title":"Genome Res"},{"key":"2025051307513050100_btaf249-B3","doi-asserted-by":"crossref","first-page":"5217","DOI":"10.1093\/nar\/gkaa265","article-title":"To petabytes and beyond: recent advances in probabilistic and signal processing algorithms and their application to metagenomics","volume":"48","author":"Elworth","year":"2020","journal-title":"Nucleic Acids Res"},{"key":"2025051307513050100_btaf249-B4","doi-asserted-by":"crossref","first-page":"293","DOI":"10.14778\/2735508.2735518","article-title":"Faster set intersection with simd instructions by reducing branch mispredictions","volume":"8","author":"Inoue","year":"2014","journal-title":"Proc VLDB Endow"},{"key":"2025051307513050100_btaf249-B5","doi-asserted-by":"crossref","first-page":"i127","DOI":"10.1093\/bioinformatics\/btz354","article-title":"Locality-sensitive hashing for the edit distance","volume":"35","author":"Mar\u00e7ais","year":"2019","journal-title":"Bioinformatics"},{"key":"2025051307513050100_btaf249-B6","doi-asserted-by":"crossref","first-page":"132","DOI":"10.1186\/s13059-016-0997-x","article-title":"Mash: fast genome and metagenome distance estimation using minhash","volume":"17","author":"Ondov","year":"2016","journal-title":"Genome Biol"},{"key":"2025051307513050100_btaf249-B7","doi-asserted-by":"crossref","first-page":"199","DOI":"10.1186\/s13059-019-1809-x","article-title":"When the levee breaks: a practical guide to sketching algorithms for processing the flood of genomic data","volume":"20","author":"Rowe","year":"2019","journal-title":"Genome Biol"},{"key":"2025051307513050100_btaf249-B8","doi-asserted-by":"crossref","first-page":"342","DOI":"10.1186\/s12859-024-05965-6","article-title":"Cudasw++ 4.0: ultra-fast gpu-based smith\u2013waterman protein sequence database search","volume":"25","author":"Schmidt","year":"2024","journal-title":"BMC Bioinformatics"},{"key":"2025051307513050100_btaf249-B9","doi-asserted-by":"crossref","first-page":"btad695","DOI":"10.1093\/bioinformatics\/btad695","article-title":"Rabbitkssd: accelerating genome distance estimation on modern multi-core architectures","volume":"39","author":"Xu","year":"2023","journal-title":"Bioinformatics"},{"key":"2025051307513050100_btaf249-B10","doi-asserted-by":"crossref","first-page":"121","DOI":"10.1186\/s13059-023-02961-6","article-title":"Rabbittclust: enabling fast clustering analysis of millions of bacteria genomes with minhash sketches","volume":"24","author":"Xu","year":"2023","journal-title":"Genome Biol"},{"key":"2025051307513050100_btaf249-B11","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1186\/s13059-021-02303-4","article-title":"Kssd: sequence dimensionality reduction by k-mer substring space sampling enables real-time large-scale datasets analysis","volume":"22","author":"Yi","year":"2021","journal-title":"Genome Biol"},{"key":"2025051307513050100_btaf249-B12","doi-asserted-by":"crossref","first-page":"403","DOI":"10.1016\/j.csbj.2017.07.004","article-title":"Computing platforms for big biological data analytics: perspectives and challenges","volume":"15","author":"Yin","year":"2017","journal-title":"Comput Struct Biotechnol J"},{"key":"2025051307513050100_btaf249-B13","doi-asserted-by":"crossref","first-page":"873","DOI":"10.1093\/bioinformatics\/btaa754","article-title":"Rabbitmash: accelerating hash-based genome analysis on modern multi-core architectures","volume":"37","author":"Yin","year":"2021","journal-title":"Bioinformatics"},{"key":"2025051307513050100_btaf249-B14","doi-asserted-by":"crossref","first-page":"2341","DOI":"10.1109\/TCBB.2022.3219114","article-title":"Rabbitfx: efficient framework for fasta\/q file parsing on modern multi-core platforms","volume":"20","author":"Zhang","year":"2023","journal-title":"IEEE\/ACM Trans Comput Biol Bioinform"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btaf249\/63014384\/btaf249.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/41\/5\/btaf249\/63014384\/btaf249.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/41\/5\/btaf249\/63014384\/btaf249.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,13]],"date-time":"2025-05-13T11:51:40Z","timestamp":1747137100000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btaf249\/8120415"}},"subtitle":[],"editor":[{"given":"Lenore","family":"Cowen","sequence":"additional","affiliation":[],"role":[{"role":"editor","vocabulary":"crossref"}]}],"short-title":[],"issued":{"date-parts":[[2025,4,26]]},"references-count":14,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,5,6]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btaf249","relation":{},"ISSN":["1367-4811"],"issn-type":[{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2025,5]]},"published":{"date-parts":[[2025,4,26]]},"article-number":"btaf249"}}