{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T22:09:10Z","timestamp":1772489350786,"version":"3.50.1"},"reference-count":7,"publisher":"Oxford University Press (OUP)","issue":"18","license":[{"start":{"date-parts":[[2017,6,5]],"date-time":"2017-06-05T00:00:00Z","timestamp":1496620800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/academic.oup.com\/journals\/pages\/about_us\/legal\/notices"}],"funder":[{"DOI":"10.13039\/501100003329","name":"MINECO","doi-asserted-by":"publisher","award":["TIN2016-76373-P"],"award-info":[{"award-number":["TIN2016-76373-P"]}],"id":[{"id":"10.13039\/501100003329","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003329","name":"MINECO","doi-asserted-by":"publisher","award":["TIN2014-54565-JIN"],"award-info":[{"award-number":["TIN2014-54565-JIN"]}],"id":[{"id":"10.13039\/501100003329","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100008530","name":"European Regional Development Fund","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100008530","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017,9,15]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Motivation<\/jats:title>\n                  <jats:p>One basic step in many bioinformatics analyses is the multiple sequence alignment. One of the state-of-the-art tools to perform multiple sequence alignment is PASTA (Practical Alignments using SAT\u00e9 and TrAnsitivity). PASTA supports multithreading but it is limited to process datasets on shared memory systems. In this work we introduce PASTASpark, a tool that uses the Big Data engine Apache Spark to boost the performance of the alignment phase of PASTA, which is the most expensive task in terms of time consumption.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>Speedups up to 10\u00d7 \u2009with respect to single-threaded PASTA were observed, which allows to process an ultra-large dataset of 200\u2009000 sequences within the 24-h limit.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>PASTASpark is an Open Source tool available at https:\/\/github.com\/citiususc\/pastaspark<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Supplementary information<\/jats:title>\n                  <jats:p>Supplementary data are available at Bioinformatics online.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btx354","type":"journal-article","created":{"date-parts":[[2017,6,1]],"date-time":"2017-06-01T11:09:52Z","timestamp":1496315392000},"page":"2948-2950","source":"Crossref","is-referenced-by-count":17,"title":["PASTASpark: multiple sequence alignment meets Big Data"],"prefix":"10.1093","volume":"33","author":[{"given":"Jos\u00e9 M","family":"Abu\u00edn","sequence":"first","affiliation":[{"name":"CiTIUS, Universidade de Santiago de Compostela, Santiago de Compostela, Spain"}]},{"given":"Tom\u00e1s F","family":"Pena","sequence":"additional","affiliation":[{"name":"CiTIUS, Universidade de Santiago de Compostela, Santiago de Compostela, Spain"}]},{"given":"Juan C","family":"Pichel","sequence":"additional","affiliation":[{"name":"CiTIUS, Universidade de Santiago de Compostela, Santiago de Compostela, Spain"}]}],"member":"286","published-online":{"date-parts":[[2017,6,5]]},"reference":[{"key":"2023020206412581300_btx354-B1","doi-asserted-by":"crossref","first-page":"483","DOI":"10.1145\/1465482.1465560","article-title":"Validity of the single processor approach to achieving large scale computing capabilities","author":"Amdahl","year":"1967","journal-title":"Proceedings of the AFIPS'67"},{"key":"2023020206412581300_btx354-B2","doi-asserted-by":"crossref","first-page":"511","DOI":"10.1093\/nar\/gki198","article-title":"MAFFT: improvement in accuracy of multiple sequence alignment","volume":"33","author":"Katoh","year":"2005","journal-title":"Nucleic Acids Res"},{"key":"2023020206412581300_btx354-B3","doi-asserted-by":"crossref","first-page":"90","DOI":"10.1093\/sysbio\/syr095","article-title":"SAT\u00e9-II: very fast and accurate simultaneous estimation of multiple sequence alignments and phylogenetic trees","volume":"61","author":"Liu","year":"2012","journal-title":"Syst. Biol"},{"key":"2023020206412581300_btx354-B4","doi-asserted-by":"crossref","first-page":"377","DOI":"10.1089\/cmb.2014.0156","article-title":"PASTA: ultra-large multiple sequence alignment for nucleotide and amino-acid sequences","volume":"22","author":"Mirarab","year":"2015","journal-title":"J. Comput. Biol"},{"key":"2023020206412581300_btx354-B5","doi-asserted-by":"crossref","first-page":"e9490.","DOI":"10.1371\/journal.pone.0009490","article-title":"FastTree2 \u2013 approximately maximum-likelihood trees for large alignments","volume":"5","author":"Price","year":"2010","journal-title":"PLoS One"},{"key":"2023020206412581300_btx354-B6","doi-asserted-by":"crossref","first-page":"i559","DOI":"10.1093\/bioinformatics\/btm226","article-title":"Multiple alignment by aligning alignments","volume":"23","author":"Wheeler","year":"2007","journal-title":"Bioinformatics"},{"key":"2023020206412581300_btx354-B7","first-page":"10","article-title":"Spark: cluster computing with working sets","author":"Zaharia","year":"2010","journal-title":"Proceedings of the USENIX Conference on Hot Topics in Cloud Computing"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/33\/18\/2948\/49041028\/bioinformatics_33_18_2948.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/33\/18\/2948\/49041028\/bioinformatics_33_18_2948.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,2]],"date-time":"2023-02-02T06:41:39Z","timestamp":1675320099000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/33\/18\/2948\/3861333"}},"subtitle":[],"editor":[{"given":"Alfonso","family":"Valencia","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2017,6,5]]},"references-count":7,"journal-issue":{"issue":"18","published-print":{"date-parts":[[2017,9,15]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btx354","relation":{},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"value":"1367-4803","type":"print"},{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2017,9,15]]},"published":{"date-parts":[[2017,6,5]]}}}