{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T03:58:45Z","timestamp":1775707125589,"version":"3.50.1"},"reference-count":14,"publisher":"Oxford University Press (OUP)","issue":"2","license":[{"start":{"date-parts":[[2024,2,11]],"date-time":"2024-02-11T00:00:00Z","timestamp":1707609600000},"content-version":"vor","delay-in-days":10,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["P30CA016042"],"award-info":[{"award-number":["P30CA016042"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["U2CCA271894"],"award-info":[{"award-number":["U2CCA271894"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["U24CA248265"],"award-info":[{"award-number":["U24CA248265"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["U54HG012517"],"award-info":[{"award-number":["U54HG012517"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,2,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Motivation<\/jats:title>\n                  <jats:p>The volume of biomedical data generated each year is growing exponentially as high-throughput molecular, imaging and mHealth technologies expand. This rise in data volume has contributed to an increasing reliance on and demand for computational methods, and consequently to increased attention to software quality and data integrity.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>To simplify data verification in diverse data-processing pipelines, we created PipeVal, a light-weight, easy-to-use, extensible tool for file validation. It is open-source, easy to integrate with complex workflows, and modularized for extensibility for new file formats. PipeVal can be rapidly inserted into existing methods and pipelines to automatically validate and verify inputs and outputs. This can reduce wasted compute time attributed to file corruption or invalid file paths, and significantly improve the quality of data-intensive software.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>PipeVal is an open-source Python package under the GPLv2 license and it is freely available at https:\/\/github.com\/uclahs-cds\/package-PipeVal. The docker image is available at: https:\/\/github.com\/uclahs-cds\/package-PipeVal\/pkgs\/container\/pipeval.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btae079","type":"journal-article","created":{"date-parts":[[2024,2,11]],"date-time":"2024-02-11T10:30:11Z","timestamp":1707647411000},"source":"Crossref","is-referenced-by-count":3,"title":["PipeVal: light-weight extensible tool for file validation"],"prefix":"10.1093","volume":"40","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3113-7010","authenticated-orcid":false,"given":"Yash","family":"Patel","sequence":"first","affiliation":[{"name":"Jonsson Comprehensive Cancer Center, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Institute for Precision Health, University of California, Los Angeles , Los Angeles, CA 90095, United States"}]},{"given":"Arpi","family":"Beshlikyan","sequence":"additional","affiliation":[{"name":"Jonsson Comprehensive Cancer Center, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Institute for Precision Health, University of California, Los Angeles , Los Angeles, CA 90095, United States"}]},{"given":"Madison","family":"Jordan","sequence":"additional","affiliation":[{"name":"Jonsson Comprehensive Cancer Center, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Institute for Precision Health, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Department of Human Genetics, University of California, Los Angeles , Los Angeles, CA 90095, United States"}]},{"given":"Gina","family":"Kim","sequence":"additional","affiliation":[{"name":"Jonsson Comprehensive Cancer Center, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Institute for Precision Health, University of California, Los Angeles , Los Angeles, CA 90095, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1012-7879","authenticated-orcid":false,"given":"Aaron","family":"Holmes","sequence":"additional","affiliation":[{"name":"Jonsson Comprehensive Cancer Center, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Institute for Precision Health, University of California, Los Angeles , Los Angeles, CA 90095, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1082-3871","authenticated-orcid":false,"given":"Takafumi N","family":"Yamaguchi","sequence":"additional","affiliation":[{"name":"Jonsson Comprehensive Cancer Center, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Institute for Precision Health, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Department of Human Genetics, University of California, Los Angeles , Los Angeles, CA 90095, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0553-7520","authenticated-orcid":false,"given":"Paul C","family":"Boutros","sequence":"additional","affiliation":[{"name":"Jonsson Comprehensive Cancer Center, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Institute for Precision Health, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Department of Human Genetics, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Department of Urology, University of California, Los Angeles , Los Angeles, CA 90095, United States"},{"name":"Broad Stem Cell Research Center, University of California, Los Angeles , Los Angeles, CA 90095, United States"}]}],"member":"286","published-online":{"date-parts":[[2024,2,10]]},"reference":[{"key":"2024022411334019400_btae079-B1","doi-asserted-by":"crossref","first-page":"giab007","DOI":"10.1093\/gigascience\/giab007","article-title":"HTSlib: C library for reading\/writing high-throughput sequencing data","volume":"10","author":"Bonfield","year":"2021","journal-title":"Gigascience"},{"key":"2024022411334019400_btae079-B2","doi-asserted-by":"crossref","first-page":"D30","DOI":"10.1093\/nar\/gks1175","article-title":"Facing growth in the European Nucleotide Archive","volume":"41","author":"Cochrane","year":"2012","journal-title":"Nucleic Acids Res"},{"key":"2024022411334019400_btae079-B3","doi-asserted-by":"crossref","first-page":"1767","DOI":"10.1093\/nar\/gkp1137","article-title":"The Sanger FASTQ file format for sequences with quality scores, and the Solexa\/Illumina FASTQ variants","volume":"38","author":"Cock","year":"2010","journal-title":"Nucleic Acids Res"},{"key":"2024022411334019400_btae079-B4","doi-asserted-by":"crossref","first-page":"138","DOI":"10.1016\/j.crbiot.2022.02.004","article-title":"Big data: historic advances and emerging trends in biomedical research","volume":"4","author":"Cremin","year":"2022","journal-title":"Curr Res Biotechnol"},{"key":"2024022411334019400_btae079-B5","doi-asserted-by":"crossref","first-page":"2156","DOI":"10.1093\/bioinformatics\/btr330","article-title":"The variant call format and VCFtools","volume":"27","author":"Danecek","year":"2011","journal-title":"Bioinformatics"},{"key":"2024022411334019400_btae079-B6","doi-asserted-by":"crossref","first-page":"54","DOI":"10.1186\/s40537-019-0217-0","article-title":"Big data in healthcare: management, analysis and future prospects","volume":"6","author":"Dash","year":"2019","journal-title":"J Big Data"},{"key":"2024022411334019400_btae079-B7","doi-asserted-by":"crossref","first-page":"316","DOI":"10.1038\/nbt.3820","article-title":"Nextflow enables reproducible computational workflows","volume":"35","author":"Di Tommaso","year":"2017","journal-title":"Nat Biotechnol"},{"key":"2024022411334019400_btae079-B8","article-title":"A survey of quality assurance practices in biomedical open source software projects","volume":"9;e8","author":"Koru","year":"2007","journal-title":"J Med Internet Res"},{"key":"2024022411334019400_btae079-B9","doi-asserted-by":"crossref","first-page":"2520","DOI":"10.1093\/bioinformatics\/bts480","article-title":"Snakemake\u2014a scalable bioinformatics workflow engine","volume":"28","author":"K\u00f6ster","year":"2012","journal-title":"Bioinformatics"},{"key":"2024022411334019400_btae079-B10","doi-asserted-by":"crossref","first-page":"2078","DOI":"10.1093\/bioinformatics\/btp352","article-title":"The Sequence Alignment\/Map format and SAMtools","volume":"25","author":"Li","year":"2009","journal-title":"Bioinformatics"},{"key":"2024022411334019400_btae079-B11","first-page":"239","article-title":"Docker: lightweight Linux containers for consistent development and deployment","volume":"2014","author":"Merkel","year":"2014","journal-title":"Linux J"},{"key":"2024022411334019400_btae079-B12","doi-asserted-by":"crossref","first-page":"33","DOI":"10.12688\/f1000research.29032.2","article-title":"Sustainable data analysis with Snakemake [version 1; peer review: 1 approved, 1 approved with reservations]","volume":"10","author":"M\u00f6lder","year":"2021","journal-title":"F1000Res"},{"key":"2024022411334019400_btae079-B13","author":"Patel","year":"2024"},{"key":"2024022411334019400_btae079-B14","doi-asserted-by":"crossref","first-page":"273","DOI":"10.12688\/f1000research.10750.2","article-title":"General guidelines for biomedical software development","volume":"6","author":"Silva","year":"2017","journal-title":"F1000Res"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btae079\/56659512\/btae079.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/40\/2\/btae079\/56750164\/btae079.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/40\/2\/btae079\/56750164\/btae079.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,24]],"date-time":"2024-02-24T11:33:57Z","timestamp":1708774437000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btae079\/7606334"}},"subtitle":[],"editor":[{"given":"Alfonso","family":"Valencia","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2024,2,1]]},"references-count":14,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2024,2,1]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btae079","relation":{},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"value":"1367-4803","type":"print"},{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2024,2,1]]},"published":{"date-parts":[[2024,2,1]]},"article-number":"btae079"}}