{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T23:37:30Z","timestamp":1768952250952,"version":"3.49.0"},"reference-count":66,"publisher":"Oxford University Press (OUP)","issue":"1","license":[{"start":{"date-parts":[[2025,8,25]],"date-time":"2025-08-25T00:00:00Z","timestamp":1756080000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/academic.oup.com\/pages\/standard-publication-reuse-rights"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,13]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:p>Large-alphabet strings, prevalent in information retrieval and natural language processing, pose unique storage and processing challenges. This paper explores the efficient implementation of the alphabet-partition approach, introducing a compressed data structure that efficiently supports the operations ${\\mathsf{rank}}$ and ${\\mathsf{select}}$. Our implementation significantly outperforms existing methods, improving the ${\\mathsf{select}}$ operation speed by 80% with only 11% additional space. We demonstrate the utility of our structure in various applications, including inverted list intersections, run-length compressed strings, and distributed computation of ${\\mathsf{rank}}$ and ${\\mathsf{select}}$. Notably, for run-length compressed strings using the Burrows\u2013Wheeler transform, our data structure requires only 0.98\u20131.09 times the space of state-of-the-art RLFM-indexes to achieve 1.23\u20132.33 times faster pattern occurrence counting while also providing better theoretical guarantees.<\/jats:p>","DOI":"10.1093\/comjnl\/bxaf102","type":"journal-article","created":{"date-parts":[[2025,8,4]],"date-time":"2025-08-04T11:21:36Z","timestamp":1754306496000},"page":"108-132","source":"Crossref","is-referenced-by-count":1,"title":["Engineering rank\/select data structures for large-alphabet strings"],"prefix":"10.1093","volume":"69","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2509-8097","authenticated-orcid":false,"given":"Diego","family":"Arroyuelo","sequence":"first","affiliation":[{"name":"Department of Computer Science, Escuela de Ingenier\u00eda, Pontificia Universidad Cat\u00f3lica de Chile & Millennium Institute for Foundational Research on Data (IMFD) , Vicu\u00f1a Mackenna 4860, Santiago, 7820436 ,","place":["Chile"]}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1454-2940","authenticated-orcid":false,"given":"Gabriel","family":"Carmona","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Pisa , Largo Bruno Pontecorvo, 3, Pisa, 56127 ,","place":["Italy"]}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1752-6124","authenticated-orcid":false,"given":"H\u00e9ctor","family":"Larra\u00f1aga","sequence":"additional","affiliation":[{"name":"Department of Informatics, Universidad T\u00e9cnica Federico Santa Mar\u00eda , Vicu\u00f1a Mackenna 3939, Santiago, 8940897 ,","place":["Chile"]}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2782-877X","authenticated-orcid":false,"given":"Francisco","family":"Riveros","sequence":"additional","affiliation":[{"name":"Department of Informatics, Universidad T\u00e9cnica Federico Santa Mar\u00eda , Vicu\u00f1a Mackenna 3939, Santiago, 8940897 ,","place":["Chile"]}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3491-5053","authenticated-orcid":false,"given":"Carlos Eugenio","family":"Rojas-Morales","sequence":"additional","affiliation":[{"name":"Department of Informatics, Universidad T\u00e9cnica Federico Santa Mar\u00eda , Vicu\u00f1a Mackenna 3939, Santiago, 8940897 ,","place":["Chile"]}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9918-9540","authenticated-orcid":false,"given":"Erick","family":"Sep\u00falveda","sequence":"additional","affiliation":[{"name":"Department of Informatics, Universidad T\u00e9cnica Federico Santa Mar\u00eda , Vicu\u00f1a Mackenna 3939, Santiago, 8940897 ,","place":["Chile"]}]}],"member":"286","published-online":{"date-parts":[[2025,8,25]]},"reference":[{"key":"2026011907123885300_ref1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781316588284","volume-title":"Compact Data Structures - a Practical Approach","author":"Navarro","year":"2016"},{"key":"2026011907123885300_ref2","doi-asserted-by":"publisher","first-page":"2","DOI":"10.1016\/j.jda.2013.07.004","article-title":"Wavelet trees for all","volume":"25","author":"Navarro","year":"2014","journal-title":"J Discrete Algorithms"},{"key":"2026011907123885300_ref3","doi-asserted-by":"publisher","first-page":"73","DOI":"10.1016\/j.is.2015.08.008","article-title":"Practical compressed string dictionaries","volume":"56","author":"Mart\u00ednez-Prieto","year":"2016","journal-title":"Inf Syst"},{"key":"2026011907123885300_ref4","doi-asserted-by":"publisher","first-page":"104794","DOI":"10.1016\/j.ic.2021.104794","article-title":"c-trie++: a dynamic trie tailored for fast prefix searches","volume":"285","author":"Tsuruta","year":"2022","journal-title":"Inf Comput"},{"key":"2026011907123885300_ref5","doi-asserted-by":"publisher","first-page":"101466","DOI":"10.1016\/j.is.2019.101466","article-title":"To index or not to index: time-space trade-offs for positional ranking functions in search engines","volume":"89","author":"Arroyuelo","year":"2020","journal-title":"Inf Syst"},{"key":"2026011907123885300_ref6","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1007\/978-3-642-16321-0_5","article-title":"Compressed self-indices supporting conjunctive queries on document collections","volume-title":"Proceedings of 17th String Processing and Information Retrieval (SPIRE\u201910)","author":"Arroyuelo","year":"2010"},{"key":"2026011907123885300_ref7","doi-asserted-by":"publisher","first-page":"819","DOI":"10.1016\/j.ipm.2011.01.008","article-title":"Distributed search based on self-indexed compressed text","volume":"48","author":"Arroyuelo","year":"2012","journal-title":"Inf Process Manage"},{"key":"2026011907123885300_ref8","doi-asserted-by":"publisher","first-page":"232","DOI":"10.1007\/s00453-012-9726-3","article-title":"Efficient fully-compressed sequence representations","volume":"69","author":"Barbay","year":"2014","journal-title":"Algorithmica"},{"key":"2026011907123885300_ref9","first-page":"1","article-title":"Data structures: time, I\/Os, entropy, joules","volume-title":"Proceedings 18th Annual European Symposium on Algorithms (ESA\u201910)","author":"Ferragina","year":"2010"},{"key":"2026011907123885300_ref10","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/3416971","article-title":"Three success stories about compact data structures","volume":"63","author":"Arroyuelo","year":"2020","journal-title":"Commun ACM"},{"key":"2026011907123885300_ref11","first-page":"841","article-title":"High-order entropy-compressed text indexes","volume-title":"Proceedings of the 14th annual ACM-SIAM symposium on Discrete algorithms (SODA\u201903)","author":"Grossi","year":"2003"},{"key":"2026011907123885300_ref12","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1016\/j.is.2014.06.002","article-title":"The wavelet matrix: an efficient wavelet tree for large alphabets","volume":"47","author":"Claude","year":"2015","journal-title":"Inf Syst"},{"key":"2026011907123885300_ref13","volume-title":"Compact PAT Trees","author":"Clark","year":"1997"},{"key":"2026011907123885300_ref14","doi-asserted-by":"crossref","first-page":"37","DOI":"10.1007\/3-540-62034-6_35","article-title":"Tables","volume-title":"Proceedings 16th Conference on Foundations of Software Technology and Theoretical Computer Science (FSTTCS\u201996)","author":"Munro","year":"1996"},{"key":"2026011907123885300_ref15","first-page":"1287","article-title":"Optimized succinct data structures for massive data","volume":"44","author":"Gog","year":"2014","journal-title":"Softw: Pract Exper"},{"key":"2026011907123885300_ref16","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1007\/978-3-031-20643-6_19","article-title":"Engineering compact data structures for rank and select queries on bit vectors","volume-title":"Proceedings of 29th International Symposium on String Processing and Information Retrieval (SPIRE\u201922)","author":"Kurpicz","year":"2022"},{"key":"2026011907123885300_ref17","author":"Clarke","year":"2023"},{"key":"2026011907123885300_ref18","author":"The Lemur Project","year":"2023"},{"key":"2026011907123885300_ref19","doi-asserted-by":"publisher","first-page":"407","DOI":"10.1145\/382780.382782","article-title":"An analysis of the Burrows\u2013Wheeler transform","volume":"48","author":"Manzini","year":"2001","journal-title":"J ACM"},{"key":"2026011907123885300_ref20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3375890","article-title":"Fully functional suffix trees and optimal text searching in BWT-runs bounded space","volume":"67","author":"Gagie","year":"2020","journal-title":"J ACM"},{"key":"2026011907123885300_ref21","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1145\/1290672.1290680","article-title":"Succinct indexable dictionaries with applications to encoding k-ary trees, prefix sums and multisets","volume":"3","author":"Raman","year":"2007","journal-title":"ACM Trans Algorithms"},{"key":"2026011907123885300_ref22","first-page":"368","article-title":"Rank\/select operations on large alphabets: a tool for text indexing","volume-title":"Proceedings of the 17th Annual ACM-SIAM Symposium on Discrete Algorithm (SODA\u201906)","author":"Golynski","year":"2006"},{"key":"2026011907123885300_ref23","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1145\/1240233.1240243","article-title":"Compressed representations of sequences and full-text indexes","volume":"3","author":"Ferragina","year":"2007","journal-title":"ACM Trans Algorithms"},{"key":"2026011907123885300_ref24","volume-title":"Multiary Wavelet Trees in Practice","author":"Bowe","year":"2010"},{"key":"2026011907123885300_ref25","volume-title":"Information Retrieval: Implementing and Evaluating Search Engines","author":"B\u00fcttcher","year":"2010"},{"key":"2026011907123885300_ref26","doi-asserted-by":"crossref","first-page":"315","DOI":"10.1109\/DCC.2018.00040","article-title":"Run compressed rank\/select for large alphabets","volume-title":"Proc. Data Compression Conference (DCC\u201918)","author":"Fuentes-Sep\u00falveda","year":"2018"},{"key":"2026011907123885300_ref27","doi-asserted-by":"publisher","first-page":"40","DOI":"10.1007\/11496656_5","article-title":"Succinct suffix arrays based on run-length encoding","volume":"3537","author":"M\u00e4kinen","year":"2005","journal-title":"Nordic JComput"},{"key":"2026011907123885300_ref28","doi-asserted-by":"crossref","first-page":"452","DOI":"10.1007\/978-3-030-32686-9_32","article-title":"A practical alphabet-partitioning rank\/select data structure","volume-title":"Proceedings of 26th International Symposium on String Processing and Information Retrieval (SPIRE\u201919)","author":"Arroyuelo","year":"2019"},{"key":"2026011907123885300_ref29","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1137\/1.9781611972870.6","article-title":"Practical entropy-compressed rank\/select dictionary","volume-title":"Proceedings of the 9th Workshop on Algorithm Engineering and Experiments (ALENEX\u201907)","author":"Okanohara","year":"2007"},{"key":"2026011907123885300_ref30","article-title":"Succincter","volume-title":"proceedings of 49th annual IEEE symposium on foundations of computer science (FOCS\u201908)","author":"Pa\u030ctra\u015fcu","year":"2008"},{"key":"2026011907123885300_ref31","doi-asserted-by":"publisher","first-page":"694","DOI":"10.1007\/s00453-021-00872-1","article-title":"Adaptive succinctness","volume":"84","author":"Arroyuelo","year":"2022","journal-title":"Algorithmica"},{"key":"2026011907123885300_ref32","doi-asserted-by":"publisher","DOI":"10.1145\/2629339","article-title":"Optimal lower and upper bounds for representing sequences","volume":"11","author":"Belazzougui","year":"2015","journal-title":"ACM Trans Algorithms"},{"key":"2026011907123885300_ref33","first-page":"148","article-title":"On the redundancy of succinct data structures","volume-title":"Proceedings of 11th Scandinavian Workshop on Algorithm Theory (SWAT\u201908), Gothenburg, Sweden, 2-4 July","author":"Golynski","year":"2008"},{"key":"2026011907123885300_ref34","first-page":"678","article-title":"Optimal trade-offs for succinct string indexes","volume-title":"Proceedings of 37th International Colloquium on Automata, Languages and Programming (ICALP\u201910), Part I","author":"Grossi","year":"2010"},{"key":"2026011907123885300_ref35","first-page":"102","article-title":"Worst-case optimal graph joins in almost no space","volume-title":"Proceedings of International Conference on Management of Data (SIGMOD\u201921), Virtual Event, China, 20\u201325 June","author":"Arroyuelo","year":"2021"},{"key":"2026011907123885300_ref36","doi-asserted-by":"publisher","first-page":"349","DOI":"10.1007\/s00778-023-00811-2","article-title":"Optimizing RPQs over a compact graph representation","volume":"33","author":"Arroyuelo","year":"2024","journal-title":"VLDB J"},{"key":"2026011907123885300_ref37","first-page":"84","article-title":"Succinct trees in practice","volume-title":"Proceedings of 12th Workshop on Algorithm Engineering and Experiments (ALENEX\u201910), Austin, Texas, USA, 16 January","author":"Arroyuelo","year":"2010"},{"key":"2026011907123885300_ref38","doi-asserted-by":"crossref","first-page":"345","DOI":"10.1007\/3-540-45061-0_29","article-title":"Succinct representations of permutations","volume-title":"Proceedings of 30th International Colloquium on Automata, Languages and Programming (ICALP\u201903)","author":"Munro","year":"2003"},{"key":"2026011907123885300_ref39","volume-title":"A Block Sorting Lossless Data Compression Algorithm","author":"Burrows","year":"1994"},{"key":"2026011907123885300_ref40","doi-asserted-by":"crossref","DOI":"10.1109\/DCC.2005.35","article-title":"Efficient alphabet partitioning algorithms for low-complexity entropy coding","volume-title":"Proceedings of data compression conference (DCC\u201905)","author":"Said"},{"key":"2026011907123885300_ref41","doi-asserted-by":"publisher","first-page":"552","DOI":"10.1145\/1082036.1082039","article-title":"Indexing compressed text","volume":"52","author":"Ferragina","year":"2005","journal-title":"J ACM"},{"key":"2026011907123885300_ref42","first-page":"383","article-title":"Efficient suffix trees on secondary storage (extended abstract)","volume-title":"Proceedings of 7th Annual ACM-SIAM Symposium on Discrete Algorithms (SODA\u201996), Atlanta, Georgia, USA, 28-30 January","author":"Clark","year":"1996"},{"key":"2026011907123885300_ref43","doi-asserted-by":"crossref","unstructured":"Arroyuelo D, Carmona G, Larra\u00f1aga H, Riveros F, Rojas-Morales CE, Sep\u00falveda E (2025) Engineering Rank\/Select Data Structures for Large-Alphabet Strings\u2014Source Code Repository. 10.5281\/zenodo.16582318.","DOI":"10.1093\/comjnl\/bxaf102"},{"key":"2026011907123885300_ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-02298-2","volume-title":"Scalability Challenges in Web Search Engines","author":"Cambazoglu","year":"2015"},{"key":"2026011907123885300_ref45","first-page":"127","article-title":"Fast generation of result snippets in web search","volume-title":"Proceedings of 30th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR\u201907), Amsterdam, The Netherlands, 23\u201327 July","author":"Turpin","year":"2007"},{"key":"2026011907123885300_ref46","first-page":"509","article-title":"Document compaction for efficient query biased snippet generation","volume-title":"Proceedings of 31th European Conference on IR Research (ECIR\u201909), Toulouse, France, 6-9 April","author":"Tsegay","year":"2009"},{"key":"2026011907123885300_ref47","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1328911.1328915","article-title":"Alternation and redundancy analysis of the intersection problem","volume":"4","author":"Barbay","year":"2008","journal-title":"ACM Trans Algorithms"},{"key":"2026011907123885300_ref48","first-page":"401","article-title":"Inverted index compression and query processing with optimized document ordering","volume-title":"Proceedings of the 18th International Conference on World Wide Web (WWW\u201909), Madrid, Spain, 20\u201324 April","author":"Yan","year":"2009"},{"key":"2026011907123885300_ref49","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3052773","article-title":"Clustered Elias-Fano indexes","volume":"36","author":"Pibiri","year":"2017","journal-title":"ACM Trans Inf Syst"},{"key":"2026011907123885300_ref50","doi-asserted-by":"crossref","first-page":"1308","DOI":"10.1016\/j.ipm.2018.05.007","article-title":"Hybrid compression of inverted lists for reordered document collections","volume":"54","author":"Arroyuelo","year":"2018","journal-title":"Inf Process Manag"},{"key":"2026011907123885300_ref51","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/SWAT.1973.13","article-title":"Linear pattern matching algorithms","volume-title":"Proceedings of 14th Annual Symposium on Switching and Automata Theory (SWAT\u201973)","author":"Weiner","year":"1973"},{"key":"2026011907123885300_ref52","doi-asserted-by":"publisher","first-page":"262","DOI":"10.1145\/321941.321946","article-title":"A space-economical suffix tree construction algorithm","volume":"23","author":"McCreight","year":"1976","journal-title":"J ACM"},{"key":"2026011907123885300_ref53","doi-asserted-by":"publisher","first-page":"66","DOI":"10.1145\/2810036","article-title":"40 years of suffix trees","volume":"59","author":"Apostolico","year":"2016","journal-title":"Commun ACM"},{"key":"2026011907123885300_ref54","doi-asserted-by":"publisher","first-page":"935","DOI":"10.1137\/0222058","article-title":"Suffix arrays: a new method for on-line string searches","volume":"22","author":"Manber","year":"1993","journal-title":"SIAM J Comput"},{"key":"2026011907123885300_ref55","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1007\/s00453-010-9443-8","article-title":"Stronger Lempel-Ziv based compressed text indexing","volume":"62","author":"Arroyuelo","year":"2012","journal-title":"Algorithmica"},{"key":"2026011907123885300_ref56","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781139940023","volume-title":"Genome-Scale Algorithm Design: Biological Sequence Analysis in the Era of High-Throughput Sequencing","author":"M\u00e4kinen","year":"2015"},{"key":"2026011907123885300_ref57","first-page":"245","article-title":"Optimizing positional index structures for versioned document collections","volume-title":"Proceedings of 35th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR\u201912)","author":"He","year":"2012"},{"key":"2026011907123885300_ref58","article-title":"Compressed text indexes: from theory to practice","volume":"13","author":"Ferragina","year":"2008","journal-title":"ACM J Exp Algorithmics"},{"key":"2026011907123885300_ref59","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3524060","article-title":"A learned approach to design compressed rank\/select data structures","volume":"18","author":"Boffa","year":"2022","journal-title":"ACM Trans Algorithms"},{"key":"2026011907123885300_ref60","first-page":"273","article-title":"Partitioned Elias-Fano indexes","volume-title":"Proceedings of 37th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR\u201914)","author":"Ottaviano","year":"2014"},{"key":"2026011907123885300_ref61","first-page":"154","article-title":"Broadword implementation of rank\/select queries","volume-title":"Proceedings of 7th International Workshop on Experimental Algorithms (WEA\u201908)","author":"Sebastiano"},{"key":"2026011907123885300_ref62","first-page":"1","article-title":"A hybrid compressed data structure supporting rank and select on bit sequences","volume-title":"Proceedings of 39th International Conference of the Chilean Computer Science Society (SCCC\u201920)","author":"Arroyuelo","year":"2020"},{"key":"2026011907123885300_ref63","doi-asserted-by":"crossref","first-page":"302","DOI":"10.1109\/DCC.2014.87","article-title":"Hybrid compression of bitvectors for the FM-index","volume-title":"Proceedings of Data Compression Conference (DCC\u201914)","author":"K\u00e4rkk\u00e4inen","year":"2014"},{"key":"2026011907123885300_ref64","first-page":"133","article-title":"Bitvectors with runs and the successor\/predecessor problem","volume-title":"Proceedings of Data Compression Conference (DCC\u201920)","author":"G\u00f3mez-Brand\u00f3n"},{"key":"2026011907123885300_ref65","first-page":"134","article-title":"Engineering the LOUDS succinct tree representation","volume-title":"Proceedings of 5th International Workshop on Experimental Algorithms (WEA\u201906)","author":"Delpratt","year":"2006"},{"key":"2026011907123885300_ref66","volume-title":"SDSL\u2014Succinct Data Structure Library (Fork)","author":"Fuentes-Sep\u00falveda","year":"2023"}],"container-title":["The Computer Journal"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/comjnl\/article-pdf\/69\/1\/108\/64119113\/bxaf102.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/comjnl\/article-pdf\/69\/1\/108\/64119113\/bxaf102.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T12:12:49Z","timestamp":1768824769000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/comjnl\/article\/69\/1\/108\/8240602"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,25]]},"references-count":66,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,8,25]]},"published-print":{"date-parts":[[2026,1,13]]}},"URL":"https:\/\/doi.org\/10.1093\/comjnl\/bxaf102","relation":{},"ISSN":["0010-4620","1460-2067"],"issn-type":[{"value":"0010-4620","type":"print"},{"value":"1460-2067","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2026,1]]},"published":{"date-parts":[[2025,8,25]]}}}