{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T13:10:44Z","timestamp":1767705044586,"version":"3.41.2"},"reference-count":18,"publisher":"Oxford University Press (OUP)","issue":"12","license":[{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"vor","delay-in-days":6,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100004359","name":"Swedish Research Council","doi-asserted-by":"publisher","award":["2018-4135"],"award-info":[{"award-number":["2018-4135"]}],"id":[{"id":"10.13039\/501100004359","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Summary<\/jats:title>\n                  <jats:p>The profusion of sequenced genomes across the bacterial and archeal domains offers unprecedented possibilities for phylogenetic and comparative genomic analyses. In general, phylogenetic reconstruction is improved by the use of more data. However, including all available data is (i) not computationally tractable, and (ii) prone to biases, as the abundance of genomes is very unequally distributed over the biological diversity. Thus, in most cases, subsampling taxa to build a phylogeny is necessary. Currently, though, there is no available software to perform that handily. Here we present TADA, a taxonomic-aware dataset selection workflow that allows sampling across user-defined portions of the prokaryotic diversity with variable granularity, while setting constraints on genome quality and balance between branches.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>TADA is implemented as a snakemake workflow and is freely available at https:\/\/github.com\/emilhaegglund\/TADA.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btad742","type":"journal-article","created":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T22:35:17Z","timestamp":1701988517000},"source":"Crossref","is-referenced-by-count":3,"title":["TADA: taxonomy-aware dataset aggregator"],"prefix":"10.1093","volume":"39","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0829-9882","authenticated-orcid":false,"given":"Emil","family":"H\u00e4gglund","sequence":"first","affiliation":[{"name":"Molecular Evolution, Department of Cell and Molecular Biology, Science for Life Laboratory, Biomedical Centre, Uppsala University , SE-751 24 Uppsala, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0864-0259","authenticated-orcid":false,"given":"Siv G E","family":"Andersson","sequence":"additional","affiliation":[{"name":"Molecular Evolution, Department of Cell and Molecular Biology, Science for Life Laboratory, Biomedical Centre, Uppsala University , SE-751 24 Uppsala, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8354-2398","authenticated-orcid":false,"given":"Lionel","family":"Guy","sequence":"additional","affiliation":[{"name":"Department of Medical Biochemistry and Microbiology, Science for Life Laboratory, Biomedical Centre, Uppsala University , SE-751 23 Uppsala, Sweden"}]}],"member":"286","published-online":{"date-parts":[[2023,12,7]]},"reference":[{"key":"2023122107463326600_btad742-B1","doi-asserted-by":"crossref","first-page":"403","DOI":"10.1016\/S0022-2836(05)80360-2","article-title":"Basic local alignment search tool","volume":"215","author":"Altschul","year":"1990","journal-title":"J Mol Biol"},{"key":"2023122107463326600_btad742-B2","doi-asserted-by":"crossref","first-page":"59","DOI":"10.1038\/nmeth.3176","article-title":"Fast and sensitive protein alignment using DIAMOND","volume":"12","author":"Buchfink","year":"2014","journal-title":"Nat Methods"},{"key":"2023122107463326600_btad742-B3","doi-asserted-by":"crossref","first-page":"100","DOI":"10.2307\/2413420","article-title":"Phylogenetics: the theory and practice of phylogenetic systematics","volume":"31","author":"Colless","year":"1982","journal-title":"Syst Zool"},{"key":"2023122107463326600_btad742-B4","doi-asserted-by":"crossref","first-page":"giad022","DOI":"10.1093\/gigascience\/giad022","article-title":"The GEN-ERA toolbox: unified and reproducible workflows for research in microbial genomics","volume":"12","author":"Cornet","year":"2022","journal-title":"Gigascience"},{"key":"2023122107463326600_btad742-B5","doi-asserted-by":"crossref","first-page":"238","DOI":"10.1186\/s13059-019-1832-y","article-title":"OrthoFinder: phylogenetic orthology inference for comparative genomics","volume":"20","author":"Emms","year":"2019","journal-title":"Genome Biol"},{"key":"2023122107463326600_btad742-B6","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1038\/s41564-020-00834-3","article-title":"Community-led, integrated, reproducible multi-omics with anvi\u2019o","volume":"6","author":"Eren","year":"2021","journal-title":"Nat Microbiol"},{"key":"2023122107463326600_btad742-B7","doi-asserted-by":"crossref","first-page":"27","DOI":"10.2307\/2412810","article-title":"The number of evolutionary trees","volume":"27","author":"Felsenstein","year":"1978","journal-title":"Syst Zool"},{"key":"2023122107463326600_btad742-B8","doi-asserted-by":"crossref","first-page":"1230","DOI":"10.1093\/bioinformatics\/btw824","article-title":"PhyloSkeleton: taxon selection, data retrieval and marker identification for phylogenomics","volume":"33","author":"Guy","year":"2017","journal-title":"Bioinformatics"},{"key":"2023122107463326600_btad742-B9","doi-asserted-by":"crossref","first-page":"1635","DOI":"10.1093\/molbev\/msw046","article-title":"ETE 3: reconstruction, analysis, and visualization of phylogenomic data","volume":"33","author":"Huerta-Cepas","year":"2016","journal-title":"Mol Biol Evol"},{"key":"2023122107463326600_btad742-B10","doi-asserted-by":"crossref","first-page":"5114","DOI":"10.1038\/s41467-018-07641-9","article-title":"High throughput ANI analysis of 90K prokaryotic genomes reveals clear species boundaries","volume":"9","author":"Jain","year":"2018","journal-title":"Nat Commun"},{"key":"2023122107463326600_btad742-B11","doi-asserted-by":"crossref","first-page":"4162","DOI":"10.1093\/bioinformatics\/btz188","article-title":"GToTree: a user-friendly workflow for phylogenomics","volume":"35","author":"Lee","year":"2019","journal-title":"Bioinformatics"},{"key":"2023122107463326600_btad742-B12","doi-asserted-by":"crossref","DOI":"10.7554\/eLife.46923","article-title":"Consistent and correctable bias in metagenomic sequencing experiments","volume":"8","author":"McLaren","year":"2019","journal-title":"Elife"},{"key":"2023122107463326600_btad742-B13","doi-asserted-by":"crossref","first-page":"164","DOI":"10.1186\/s12859-018-2164-8","article-title":"Treemmer: a tool to reduce large phylogenetic datasets with minimal loss of diversity","volume":"19","author":"Menardo","year":"2018","journal-title":"BMC Bioinformatics"},{"key":"2023122107463326600_btad742-B14","doi-asserted-by":"crossref","first-page":"33","DOI":"10.12688\/f1000research.29032.2","article-title":"Sustainable data analysis with snakemake","volume":"10","author":"M\u00f6lder","year":"2021","journal-title":"F1000Res"},{"key":"2023122107463326600_btad742-B15","doi-asserted-by":"crossref","first-page":"D785","DOI":"10.1093\/nar\/gkab776","article-title":"GTDB: an ongoing census of bacterial and archaeal diversity through a phylogenetically consistent, rank normalized and complete Genome-Based taxonomy","volume":"50","author":"Parks","year":"2022","journal-title":"Nucleic Acids Res"},{"key":"2023122107463326600_btad742-B16","doi-asserted-by":"crossref","first-page":"167","DOI":"10.1007\/978-1-0716-2691-7_8","article-title":"Testing phylogenetic stability with variable taxon sampling","volume":"2569","author":"Powell","year":"2022","journal-title":"Methods Mol Biol"},{"key":"2023122107463326600_btad742-B17","doi-asserted-by":"crossref","first-page":"2068","DOI":"10.1093\/bioinformatics\/btu153","article-title":"Prokka: rapid prokaryotic genome annotation","volume":"30","author":"Seemann","year":"2014","journal-title":"Bioinformatics"},{"key":"2023122107463326600_btad742-B18","doi-asserted-by":"crossref","first-page":"1056","DOI":"10.1038\/nature08656","article-title":"A phylogeny-driven genomic encyclopaedia of bacteria and Archaea","volume":"462","author":"Wu","year":"2009","journal-title":"Nature"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btad742\/54083135\/btad742.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/39\/12\/btad742\/54717190\/btad742.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/39\/12\/btad742\/54717190\/btad742.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,21]],"date-time":"2023-12-21T08:35:55Z","timestamp":1703147755000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btad742\/7461186"}},"subtitle":[],"editor":[{"given":"Russell","family":"Schwartz","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2023,12,1]]},"references-count":18,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2023,12,1]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btad742","relation":{},"ISSN":["1367-4811"],"issn-type":[{"type":"electronic","value":"1367-4811"}],"subject":[],"published-other":{"date-parts":[[2023,12,1]]},"published":{"date-parts":[[2023,12,1]]},"article-number":"btad742"}}