{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T13:00:26Z","timestamp":1768568426089,"version":"3.49.0"},"reference-count":31,"publisher":"Oxford University Press (OUP)","issue":"19","license":[{"start":{"date-parts":[[2019,4,8]],"date-time":"2019-04-08T00:00:00Z","timestamp":1554681600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/academic.oup.com\/journals\/pages\/open_access\/funder_policies\/chorus\/standard_publication_model"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IIS-1661760"],"award-info":[{"award-number":["IIS-1661760"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IIS-1661756"],"award-info":[{"award-number":["IIS-1661756"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IIS-1715202"],"award-info":[{"award-number":["IIS-1715202"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019,10,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:sec>\n                  <jats:title>Motivation<\/jats:title>\n                  <jats:p>Principal Component Analysis is a key tool in the study of population structure in human genetics. 
As modern datasets become increasingly larger in size, traditional approaches based on loading the entire dataset in the system memory (Random Access Memory) become impractical and out-of-core implementations are the only viable alternative.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Results<\/jats:title>\n                  <jats:p>We present TeraPCA, a C++ implementation of the Randomized Subspace Iteration method to perform Principal Component Analysis of large-scale datasets. TeraPCA can be applied both in-core and out-of-core and is able to successfully operate even on commodity hardware with a system memory of just a few gigabytes. Moreover, TeraPCA has minimal dependencies on external libraries and only requires a working installation of the BLAS and LAPACK libraries. When applied to a dataset containing a million individuals genotyped on a million markers, TeraPCA requires &lt;5\u00a0h (in multi-threaded mode) to accurately compute the 10 leading principal components. 
An extensive experimental analysis shows that TeraPCA is both fast and accurate and is competitive with current state-of-the-art software for the same task.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Availability and implementation<\/jats:title>\n                  <jats:p>Source code and documentation are both available at https:\/\/github.com\/aritra90\/TeraPCA.<\/jats:p>\n               <\/jats:sec>\n               <jats:sec>\n                  <jats:title>Supplementary information<\/jats:title>\n                  <jats:p>Supplementary data are available at Bioinformatics online.<\/jats:p>\n               <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btz157","type":"journal-article","created":{"date-parts":[[2019,4,4]],"date-time":"2019-04-04T19:56:16Z","timestamp":1554407776000},"page":"3679-3683","source":"Crossref","is-referenced-by-count":37,"title":["TeraPCA: a fast and scalable software package to study genetic variation in tera-scale genotypes"],"prefix":"10.1093","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8665-056X","authenticated-orcid":false,"given":"Aritra","family":"Bose","sequence":"first","affiliation":[{"name":"Computer Science Department, Purdue University , West Lafayette, IN, USA"}]},{"given":"Vassilis","family":"Kalantzis","sequence":"additional","affiliation":[{"name":"IBM Research, Thomas J. 
Watson Research Center , Yorktown Heights, NY, USA"}]},{"given":"Eugenia-Maria","family":"Kontopoulou","sequence":"additional","affiliation":[{"name":"Computer Science Department, Purdue University , West Lafayette, IN, USA"}]},{"given":"Mai","family":"Elkady","sequence":"additional","affiliation":[{"name":"Computer Science Department, Purdue University , West Lafayette, IN, USA"}]},{"given":"Peristera","family":"Paschou","sequence":"additional","affiliation":[{"name":"Department of Biological Sciences, Purdue University , West Lafayette, IN, USA"}]},{"given":"Petros","family":"Drineas","sequence":"additional","affiliation":[{"name":"Computer Science Department, Purdue University , West Lafayette, IN, USA"}]}],"member":"286","published-online":{"date-parts":[[2019,4,8]]},"reference":[{"key":"2023020108351199800_btz157-B1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1371\/journal.pone.0093766","article-title":"Fast principal component analysis of large-scale genome-wide data","volume":"9","author":"Abraham","year":"2014","journal-title":"PLoS One"},{"key":"2023020108351199800_btz157-B2","doi-asserted-by":"crossref","first-page":"2776","DOI":"10.1093\/bioinformatics\/btx299","article-title":"FlashPCA2: principal component analysis of Biobank-scale genotype datasets","volume":"33","author":"Abraham","year":"2017","journal-title":"Bioinformatics"},{"key":"2023020108351199800_btz157-B3","doi-asserted-by":"crossref","first-page":"1655","DOI":"10.1101\/gr.094052.109","article-title":"Fast model-based estimation of ancestry in unrelated individuals","volume":"19","author":"Alexander","year":"2009","journal-title":"Genome Res"},{"key":"2023020108351199800_btz157-B4","doi-asserted-by":"crossref","DOI":"10.1137\/1.9780898719604","volume-title":"LAPACK Users\u2019 Guide","author":"Anderson","year":"1999","edition":"3rd edn"},{"key":"2023020108351199800_btz157-B5","article-title":"Dissecting Population Substructure in India via Correlation Optimization of Genetics 
and Geodemographics","author":"Bose","year":"2017","journal-title":"bioRxiv"},{"key":"2023020108351199800_btz157-B6","doi-asserted-by":"crossref","first-page":"261","DOI":"10.1126\/science.296.5566.261b","article-title":"A human genome diversity cell line panel","volume":"296","author":"Cann","year":"2002","journal-title":"Science"},{"key":"2023020108351199800_btz157-B7","doi-asserted-by":"crossref","first-page":"490.","DOI":"10.2307\/2058750","article-title":"The history and geography of human genes","volume":"54","author":"Chisholm","year":"1995","journal-title":"J. Asian Stud"},{"key":"2023020108351199800_btz157-B8","doi-asserted-by":"crossref","first-page":"80","DOI":"10.1145\/2842602","article-title":"RandNLA: randomized numerical linear algebra","volume":"59","author":"Drineas","year":"2016","journal-title":"Commun. ACM"},{"key":"2023020108351199800_btz157-B9","first-page":"1","volume-title":"The Mathematics of Data, IAS\/Park City Mathematics Series","author":"Drineas","year":"2018"},{"key":"2023020108351199800_btz157-B10","doi-asserted-by":"crossref","first-page":"567","DOI":"10.1137\/16M1091745","article-title":"Structural convergence results for low-rank approximations from block Krylov spaces","volume":"39","author":"Drineas","year":"2018","journal-title":"SIAM J. Matrix Anal. Appl"},{"key":"2023020108351199800_btz157-B11","doi-asserted-by":"crossref","first-page":"456","DOI":"10.1016\/j.ajhg.2015.12.022","article-title":"Fast principal-component analysis reveals convergent evolution of ADH1B in Europe and East Asia","volume":"98","author":"Galinsky","year":"2016","journal-title":"Am. J. Hum. Genet"},{"key":"2023020108351199800_btz157-B12","doi-asserted-by":"crossref","first-page":"1587","DOI":"10.1038\/ng.3710","article-title":"Scaling probabilistic models of genetic variation to millions of humans","volume":"48","author":"Gopalan","year":"2016","journal-title":"Nat. 
Genet"},{"key":"2023020108351199800_btz157-B13","doi-asserted-by":"crossref","first-page":"217","DOI":"10.1137\/090771806","article-title":"Finding structure with randomness: probabilistic algorithms for constructing approximate matrix decompositions","volume":"53","author":"Halko","year":"2011","journal-title":"SIAM Rev"},{"key":"2023020108351199800_btz157-B14","doi-asserted-by":"crossref","first-page":"417","DOI":"10.1037\/h0071325","article-title":"Analysis of a complex of statistical variables into principal components","volume":"24","author":"Hotelling","year":"1933","journal-title":"J. Educ. Psychol"},{"key":"2023020108351199800_btz157-B15","doi-asserted-by":"crossref","first-page":"321","DOI":"10.1093\/biomet\/28.3-4.321","article-title":"Relations between two sets of variates","volume":"28","author":"Hotelling","year":"1936","journal-title":"Biometrika"},{"key":"2023020108351199800_btz157-B16","doi-asserted-by":"crossref","first-page":"786","DOI":"10.1126\/science.356262","article-title":"Synthetic maps of human gene frequencies in Europeans","volume":"201","author":"Menozzi","year":"1978","journal-title":"Science"},{"key":"2023020108351199800_btz157-B17","first-page":"1396","volume-title":"Advances in Neural Information Processing Systems 28","author":"Musco","year":"2015"},{"key":"2023020108351199800_btz157-B18","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1038\/nature07331","article-title":"Genes mirror geography within Europe","volume":"456","author":"Novembre","year":"2008","journal-title":"Nature"},{"key":"2023020108351199800_btz157-B19","doi-asserted-by":"crossref","DOI":"10.1137\/1.9781611971163","volume-title":"The Symmetric Eigenvalue Problem","author":"Parlett","year":"1998"},{"key":"2023020108351199800_btz157-B20","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1371\/journal.pgen.0030160","article-title":"PCA-correlated SNPs for structure identification in worldwide human 
populations","volume":"3","author":"Paschou","year":"2007","journal-title":"PLoS Genet"},{"key":"2023020108351199800_btz157-B21","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1371\/journal.pgen.1000114","article-title":"Tracing sub-structure in the European American population with PCA-informative markers","volume":"4","author":"Paschou","year":"2008","journal-title":"PLoS Genet"},{"key":"2023020108351199800_btz157-B22","doi-asserted-by":"crossref","first-page":"9211","DOI":"10.1073\/pnas.1320811111","article-title":"Maritime route of colonization of Europe","volume":"111","author":"Paschou","year":"2014","journal-title":"Proc. Natl. Acad. Sci. USA"},{"key":"2023020108351199800_btz157-B23","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1371\/journal.pgen.0020190","article-title":"Population structure and eigenanalysis","volume":"2","author":"Patterson","year":"2006","journal-title":"PLoS Genet"},{"key":"2023020108351199800_btz157-B24","doi-asserted-by":"crossref","first-page":"559","DOI":"10.1080\/14786440109462720","article-title":"On lines and planes of closest fit to systems of points in space","volume":"2","author":"Pearson","year":"1901","journal-title":"Lond. Edinb. Dubl. Phil. Mag"},{"key":"2023020108351199800_btz157-B25","doi-asserted-by":"crossref","first-page":"904","DOI":"10.1038\/ng1847","article-title":"Principal components analysis corrects for stratification in genome-wide association studies","volume":"38","author":"Price","year":"2006","journal-title":"Nat. Genet"},{"key":"2023020108351199800_btz157-B26","doi-asserted-by":"crossref","first-page":"459","DOI":"10.1038\/nrg2813","article-title":"New approaches to population stratification in genome-wide association studies","volume":"11","author":"Price","year":"2010","journal-title":"Nat. Rev. 
Genet"},{"key":"2023020108351199800_btz157-B27","doi-asserted-by":"crossref","first-page":"945","DOI":"10.1093\/genetics\/155.2.945","article-title":"Inference of population structure using multilocus genotype data","volume":"155","author":"Pritchard","year":"2000","journal-title":"Genetics"},{"key":"2023020108351199800_btz157-B28","doi-asserted-by":"crossref","first-page":"1100","DOI":"10.1137\/080736417","article-title":"A randomized algorithm for principal component analysis","volume":"31","author":"Rokhlin","year":"2010","journal-title":"SIAM J. Matrix Anal. Appl"},{"key":"2023020108351199800_btz157-B29","doi-asserted-by":"crossref","DOI":"10.1137\/1.9781611970739","volume-title":"Numerical Methods for Large Eigenvalue Problems","author":"Saad","year":"2011"},{"key":"2023020108351199800_btz157-B30","doi-asserted-by":"crossref","first-page":"13","DOI":"10.2202\/1544-6115.1493","article-title":"Comparing spatial maps of human population-genetic variation using procrustes analysis","volume":"9","author":"Wang","year":"2010","journal-title":"Stat. Appl. Genet. Mol. 
Biol"},{"key":"2023020108351199800_btz157-B31","first-page":"1358","article-title":"Estimating f-statistics for the analysis of population structure","volume":"38","author":"Weir","year":"1984","journal-title":"Evolution"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btz157\/28492008\/btz157.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/35\/19\/3679\/48976079\/bioinformatics_35_19_3679.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/35\/19\/3679\/48976079\/bioinformatics_35_19_3679.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T19:43:35Z","timestamp":1675280615000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/35\/19\/3679\/5430929"}},"subtitle":[],"editor":[{"given":"Russell","family":"Schwartz","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2019,4,8]]},"references-count":31,"journal-issue":{"issue":"19","published-print":{"date-parts":[[2019,10,1]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btz157","relation":{},"ISSN":["1367-4803","1367-4811"],"issn-type":[{"value":"1367-4803","type":"print"},{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2019,10,1]]},"published":{"date-parts":[[2019,4,8]]}}}