{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T23:30:59Z","timestamp":1775086259211,"version":"3.50.1"},"reference-count":13,"publisher":"Oxford University Press (OUP)","issue":"4","license":[{"start":{"date-parts":[[2023,3,24]],"date-time":"2023-03-24T00:00:00Z","timestamp":1679616000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["2019R1A6A1A10073437"],"award-info":[{"award-number":["2019R1A6A1A10073437"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["2020M3A9G7103933"],"award-info":[{"award-number":["2020M3A9G7103933"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["2021R1C1C102065"],"award-info":[{"award-number":["2021R1C1C102065"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["2021M3A9I4021220"],"award-info":[{"award-number":["2021M3A9I4021220"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,4,3]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:sec>\n                    <jats:title>Summary<\/jats:title>\n                    <jats:p>Highly accurate protein structure predictors have generated hundreds of millions of protein structures; these pose a challenge in terms of storage and processing. Here, we present Foldcomp, a novel lossy structure compression algorithm, and indexing system to address this challenge. By using a combination of internal and Cartesian coordinates and a bi-directional NeRF-based strategy, Foldcomp improves the compression ratio by a factor of three compared to the next best method. Its reconstruction error of 0.08\u2009\u00c5 is comparable to the best lossy compressor. It is five times faster than the next fastest compressor and competes with the fastest decompressors. With its multi-threading implementation and a Python interface that allows for easy database downloads and efficient querying of protein structures by accession, Foldcomp is a powerful tool for managing and analysing large collections of protein structures.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Availability and implementation<\/jats:title>\n                    <jats:p>Foldcomp is a free open-source software (GPLv3) and available for Linux, macOS, and Windows at https:\/\/foldcomp.foldseek.com. Foldcomp provides the AlphaFold Swiss-Prot (2.9GB), TrEMBL (1.1TB), and ESMatlas HQ (114GB) database ready-for-download.<\/jats:p>\n                  <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btad153","type":"journal-article","created":{"date-parts":[[2023,3,24]],"date-time":"2023-03-24T10:06:51Z","timestamp":1679652411000},"source":"Crossref","is-referenced-by-count":26,"title":["Foldcomp: a library and format for compressing and indexing large protein structure sets"],"prefix":"10.1093","volume":"39","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4662-3943","authenticated-orcid":false,"given":"Hyunbin","family":"Kim","sequence":"first","affiliation":[{"name":"Interdisciplinary Program in Bioinformatics, Seoul National University , Seoul 08826, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8637-6719","authenticated-orcid":false,"given":"Milot","family":"Mirdita","sequence":"additional","affiliation":[{"name":"School of Biological Sciences, Seoul National University , Seoul 08826, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8781-9753","authenticated-orcid":false,"given":"Martin","family":"Steinegger","sequence":"additional","affiliation":[{"name":"Interdisciplinary Program in Bioinformatics, Seoul National University , Seoul 08826, South Korea"},{"name":"School of Biological Sciences, Seoul National University , Seoul 08826, South Korea"},{"name":"Institute of Molecular Biology and Genetics, Seoul National University , Seoul 08826, South Korea"},{"name":"Artificial Intelligence Institute, Seoul National University , Seoul 08826, South Korea"}]}],"member":"286","published-online":{"date-parts":[[2023,3,24]]},"reference":[{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"e1005575","DOI":"10.1371\/journal.pcbi.1005575","article-title":"MMTF-An efficient file format for the transmission, visualization, and analysis of macromolecular structures","volume":"13","author":"Bradley","year":"2017","journal-title":"PLoS Comput Biol"},{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"583","DOI":"10.1038\/s41586-021-03819-2","article-title":"Highly accurate protein structure prediction with AlphaFold","volume":"596","author":"Jumper","year":"2021","journal-title":"Nature"},{"issue":"6637","key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"1123","DOI":"10.1126\/science.ade2574","article-title":"Evolutionary-scale prediction of atomic-level protein structure with a language model","volume":"379","author":"Lin","year":"2023","journal-title":"Science"},{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"1063","DOI":"10.1002\/jcc.20237","article-title":"Practical conversion from torsion space to cartesian space for in silico protein synthesis","volume":"26","author":"Parsons","year":"2005","journal-title":"J Comput Chem"},{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"e4511","DOI":"10.1002\/pro.4511","article-title":"Quantifying the effects of lossy compression on energies calculated from molecular dynamics trajectories","volume":"31","author":"Roe","year":"2022","journal-title":"Protein Sci"},{"issue":"9","key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"1460","DOI":"10.1002\/jcc.20906","article-title":"Fast procedure for reconstruction of fullatom protein models from reduced representations","volume":"29","author":"Rotkiewicz","year":"2008","journal-title":"J Comput Chem"},{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"e1008247","DOI":"10.1371\/journal.pcbi.1008247","article-title":"BinaryCIF and CIFTools-Lightweight, efficient and extensible macromolecular data management","volume":"16","author":"Sehnal","year":"2020","journal-title":"PLoS Comput Biol"},{"key":"2023041017481455000_","article-title":"Image-centric compression of protein structures improves space savings","author":"Staniscia","year":"2022","journal-title":"bioRxiv"},{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"1026","DOI":"10.1038\/nbt.3988","article-title":"MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets","volume":"35","author":"Steinegger","year":"2017","journal-title":"Nat Biotechnol"},{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1371\/journal.pone.0174846","article-title":"Towards an efficient compression of 3d coordinates of macromolecular structures","volume":"12","author":"Valasatava","year":"2017","journal-title":"PLoS ONE"},{"key":"2023041017481455000_","article-title":"Foldseek: fast and accurate protein structure search","author":"van Kempen","year":"2022","journal-title":"bioRxiv"},{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"D439","DOI":"10.1093\/nar\/gkab1061","article-title":"AlphaFold protein structure database: massively expanding the structural coverage of protein-sequence space with high-accuracy models","volume":"50","author":"Varadi","year":"2021","journal-title":"Nucleic Acids Res"},{"key":"2023041017481455000_","doi-asserted-by":"crossref","first-page":"167599","DOI":"10.1016\/j.jmb.2022.167599","article-title":"PDBx\/mmCIF Ecosystem: Foundational Semantic Tools for Structural Biology","volume":"434","author":"Westbrook","year":"2022","journal-title":"J Mol Biol"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btad153\/49628100\/btad153.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/39\/4\/btad153\/49807920\/btad153.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/39\/4\/btad153\/49807920\/btad153.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,9]],"date-time":"2023-12-09T10:23:43Z","timestamp":1702117423000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btad153\/7085592"}},"subtitle":[],"editor":[{"given":"Lenore","family":"Cowen","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2023,3,24]]},"references-count":13,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,4,3]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btad153","relation":{"has-preprint":[{"id-type":"doi","id":"10.1101\/2022.12.09.519715","asserted-by":"object"}]},"ISSN":["1367-4811"],"issn-type":[{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2023,4,1]]},"published":{"date-parts":[[2023,3,24]]},"article-number":"btad153"}}