{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T11:00:52Z","timestamp":1772190052589,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T00:00:00Z","timestamp":1772150400000},"content-version":"vor","delay-in-days":27,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["BMC Bioinformatics"],"DOI":"10.1186\/s12859-026-06372-9","type":"journal-article","created":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T09:53:05Z","timestamp":1769853185000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Sample size requirements for machine learning classification of binary outcomes in bulk RNA-Seq data"],"prefix":"10.1186","volume":"27","author":[{"given":"Scott","family":"Silvey","sequence":"first","affiliation":[]},{"given":"Amy","family":"Olex","sequence":"additional","affiliation":[]},{"given":"Shaojun","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Jinze","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,31]]},"reference":[{"key":"6372_CR1","doi-asserted-by":"publisher","first-page":"A1","DOI":"10.1016\/j.metabol.2018.08.002","volume":"87","author":"N Perakakis","year":"2018","unstructured":"Perakakis N, Yazdani A, Karniadakis GE, Mantzoros C. Omics, big data and machine learning as tools to propel understanding of biological mechanisms and to discover novel diagnostics and therapeutics. Metabolism. 2018;87:A1-9.","journal-title":"Metabolism"},{"issue":"6","key":"6372_CR2","doi-asserted-by":"publisher","first-page":"839","DOI":"10.1101\/gr.073262.107","volume":"18","author":"RA Holt","year":"2008","unstructured":"Holt RA, Jones SJM. The new paradigm of flow cell sequencing. Genome Res. 2008;18(6):839\u201346.","journal-title":"Genome Res"},{"key":"6372_CR3","doi-asserted-by":"crossref","unstructured":"Smail C, Montgomery SB. RNA Sequencing in Disease Diagnosis. Annual Review of Genomics and Human Genetics [Internet]. 2024 Feb 15 [cited 2024 Oct 10];25(1). Available from: https:\/\/pubmed.ncbi.nlm.nih.gov\/38360541\/","DOI":"10.1146\/annurev-genom-021623-121812"},{"issue":"4","key":"6372_CR4","doi-asserted-by":"publisher","first-page":"230","DOI":"10.1136\/svn-2017-000101","volume":"2","author":"F Jiang","year":"2017","unstructured":"Jiang F, Jiang Y, Zhi H, Dong Y, Li H, Ma S, et al. Artificial intelligence in healthcare: past, present and future. Stroke Vasc Neurol. 2017;2(4):230\u201343.","journal-title":"Stroke Vasc Neurol"},{"issue":"3","key":"6372_CR5","doi-asserted-by":"publisher","first-page":"540","DOI":"10.1136\/gutjnl-2019-318860","volume":"69","author":"S Yu","year":"2019","unstructured":"Yu S, Li Y, Liao Z, Wang Z, Wang Z, Li Y, et al. Plasma extracellular vesicle long RNA profiling identifies a diagnostic signature for the detection of pancreatic ductal adenocarcinoma. Gut. 2019;69(3):540\u201350.","journal-title":"Gut"},{"issue":"2","key":"6372_CR6","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1165\/rcmb.2017-0430TR","volume":"59","author":"CM Koch","year":"2018","unstructured":"Koch CM, Chiu SF, Akbarpour M, Bharat A, Ridge KM, Bartom ET, et al. A Beginner\u2019s Guide to Analysis of RNA Sequencing Data. Am J Respir Cell Mol Biol. 2018;59(2):145\u201357.","journal-title":"Am J Respir Cell Mol Biol"},{"key":"6372_CR7","doi-asserted-by":"publisher","DOI":"10.1186\/s12859-019-3252-0","author":"BV Church","year":"2019","unstructured":"Church BV, Williams HT, Mar JC. Investigating skewness to understand gene expression heterogeneity in large patient cohorts. BMC Bioinformatics. 2019. https:\/\/doi.org\/10.1186\/s12859-019-3252-0.","journal-title":"BMC Bioinformatics"},{"key":"6372_CR8","first-page":"1","volume":"1","author":"M Teufel","year":"2022","unstructured":"Teufel M, Sobetzko P. Reducing costs for DNA and RNA sequencing by sample pooling using a metagenomic approach. BMC Genomics. 2022;1:1\u201310.","journal-title":"BMC Genomics"},{"key":"6372_CR9","doi-asserted-by":"crossref","unstructured":"Jones C, Gannon B, Wakai A, O\u2019Sullivan R. A systematic review of the cost of data collection for performance monitoring in hospitals. Systematic Reviews. 2015;4(1).","DOI":"10.1186\/s13643-015-0013-7"},{"key":"6372_CR10","doi-asserted-by":"publisher","DOI":"10.1186\/s12918-018-0650-2","author":"S Mallik","year":"2018","unstructured":"Mallik S, Zhao Z. Identification of gene signatures from RNA-seq data using Pareto-optimal cluster algorithm. BMC Syst Biol. 2018. https:\/\/doi.org\/10.1186\/s12918-018-0650-2.","journal-title":"BMC Syst Biol"},{"issue":"724","key":"6372_CR11","doi-asserted-by":"publisher","first-page":"150225","DOI":"10.1016\/j.bbrc.2024.150225","volume":"1","author":"Y Cheng","year":"2024","unstructured":"Cheng Y, Xu SM, Santucci K, Lindner G, Janitz M. Machine learning and related approaches in transcriptomics. Biochem Biophys Res Commun. 2024;1(724):150225\u201335.","journal-title":"Biochem Biophys Res Commun"},{"issue":"2","key":"6372_CR12","doi-asserted-by":"publisher","first-page":"221","DOI":"10.3390\/biom13020221","volume":"13","author":"H Jeon","year":"2023","unstructured":"Jeon H, Xie J, Jeon Y, Jung KJ, Gupta A, Chang W, et al. Statistical Power Analysis for Designing Bulk, Single-Cell, and Spatial Transcriptomics Experiments: Review, Tutorial, and Perspectives. Biomolecules. 2023;13(2):221\u201331.","journal-title":"Biomolecules"},{"key":"6372_CR13","unstructured":"Home - GEO - NCBI [Internet]. Nih.gov. 2019. Available from: https:\/\/www.ncbi.nlm.nih.gov\/geo\/"},{"key":"6372_CR14","unstructured":"National Cancer Institute. The Cancer Genome Atlas Program (TCGA) - NCI [Internet]. www.cancer.gov. 2022. Available from: https:\/\/www.cancer.gov\/ccg\/research\/genome-sequencing\/tcga"},{"issue":"2","key":"6372_CR15","doi-asserted-by":"publisher","DOI":"10.1016\/j.xpro.2021.100478","volume":"2","author":"P Sanchis","year":"2021","unstructured":"Sanchis P, Lavignolle R, Abbate M, Lage-Vickers S, Vazquez E, Cotignola J, et al. Analysis workflow of publicly available RNA-sequencing datasets. STAR Protoc. 2021;2(2):100478.","journal-title":"STAR Protoc"},{"issue":"3","key":"6372_CR16","doi-asserted-by":"publisher","DOI":"10.1016\/j.xgen.2021.100067","volume":"1","author":"A Thennavan","year":"2021","unstructured":"Thennavan A, Beca F, Xia Y, Garcia-Recio S, Allison K, Collins LC, et al. Molecular analysis of TCGA breast cancer histologic types. Cell Genomics. 2021;1(3):100067.","journal-title":"Cell Genomics."},{"issue":"2","key":"6372_CR17","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1016\/j.cell.2018.02.052","volume":"173","author":"J Liu","year":"2018","unstructured":"Liu J, Lichtenberg T, Hoadley KA, Poisson LM, Lazar AJ, Cherniack AD, et al. An integrated TCGA pan-cancer clinical data resource to drive high-quality survival outcome analytics. Cell. 2018;173(2):400-416.e11.","journal-title":"Cell"},{"key":"6372_CR18","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-020-77284-8","author":"D Anusewicz","year":"2020","unstructured":"Anusewicz D, Orzechowska M, Bednarek AK. Lung squamous cell carcinoma and lung adenocarcinoma differential gene expression regulation through pathways of Notch, Hedgehog, Wnt, and ErbB signalling. Sci Rep. 2020. https:\/\/doi.org\/10.1038\/s41598-020-77284-8.","journal-title":"Sci Rep"},{"key":"6372_CR19","doi-asserted-by":"crossref","unstructured":"\u0417axapo\u0432a \u0413C, Efimov V, Mikhail Raevskiy, Py\u043c\u044f\u043d\u0446e\u0432 \u041fO, Gudkov A, Oksana Yu. Belogurova-Ovchinnikova, et al. Reclassification of TCGA Diffuse Glioma Profiles Linked to Transcriptomic, Epigenetic, Genomic and Clinical Data, According to the 2021 WHO CNS Tumor Classification. International Journal of Molecular Sciences. 2022 Dec 21;24(1):157\u20137.","DOI":"10.3390\/ijms24010157"},{"issue":"2","key":"6372_CR20","doi-asserted-by":"publisher","first-page":"462","DOI":"10.1016\/j.cell.2013.09.034","volume":"155","author":"W Brennan Cameron","year":"2013","unstructured":"Brennan Cameron W, Verhaak Roel GW, McKenna A, Campos B, Noushmehr H, SalamaSofie R, et al. The Somatic Genomic Landscape of Glioblastoma. Cell. 2013;155(2):462\u201377.","journal-title":"Cell"},{"key":"6372_CR21","doi-asserted-by":"crossref","unstructured":"Akshata Desai KA. Triple Negative Breast Cancer \u2013 An Overview. Hereditary Genetics [Internet]. 2012; Available from: https:\/\/www.ncbi.nlm.nih.gov\/pmc\/articles\/PMC4181680\/","DOI":"10.4172\/2161-1041.S2-001"},{"key":"6372_CR22","volume-title":"Pattern Recognition: A Statistical Approach","author":"P Devijver","year":"1982","unstructured":"Devijver P, Kittler J. Pattern Recognition: A Statistical Approach. New Jersey: United States. Prentice-Hall; 1982."},{"key":"6372_CR23","doi-asserted-by":"crossref","unstructured":"Webb GI, Sammut C, Perlich C, Horv\u00e1th T, Wrobel S, Korb KB. Learning curves in machine learning. Encyclopedia of Machine Learning. 2011:577\u2013580.","DOI":"10.1007\/978-0-387-30164-8_452"},{"issue":"1","key":"6372_CR24","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1148\/radiology.143.1.7063747","volume":"143","author":"JA Hanley","year":"1982","unstructured":"Hanley JA, McNeil BJ. The meaning and use of the area under a receiver operating characteristic (ROC) curve. Radiology. 1982;143(1):29\u201336.","journal-title":"Radiology"},{"issue":"7","key":"6372_CR25","doi-asserted-by":"publisher","first-page":"1145","DOI":"10.1016\/S0031-3203(96)00142-2","volume":"30","author":"AP Bradley","year":"1997","unstructured":"Bradley AP. The use of the area under the ROC curve in the evaluation of machine learning algorithms. Pattern Recognit. 1997;30(7):1145\u201359.","journal-title":"Pattern Recognit"},{"issue":"4","key":"6372_CR26","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1093\/jamia\/ocaa303","volume":"28","author":"D Kaur","year":"2021","unstructured":"Kaur D, Sobiesk M, Patil S, Liu J, Bhagat P, Gupta A, et al. Application of Bayesian networks to generate synthetic health data. J Am Med Inf Assoc JAMIA. 2021;28(4):801\u201311.","journal-title":"J Am Med Inf Assoc JAMIA."},{"issue":"12","key":"6372_CR27","doi-asserted-by":"publisher","DOI":"10.1186\/s13059-014-0550-8","volume":"15","author":"MI Love","year":"2014","unstructured":"Love MI, Huber W, Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 2014;15(12):550.","journal-title":"Genome Biol"},{"key":"6372_CR28","doi-asserted-by":"crossref","unstructured":"Kursa MB, Rudnicki WR. Feature Selection with theBorutaPackage. Journal of Statistical Software. 2010;36(11).","DOI":"10.18637\/jss.v036.i11"},{"issue":"9","key":"6372_CR29","doi-asserted-by":"publisher","first-page":"e0264246","DOI":"10.1371\/journal.pone.0264246","volume":"17","author":"D Li","year":"2022","unstructured":"Li D, Zand MS, Dye TD, Goniewicz ML, Rahman I, Xie Z. An evaluation of RNA-seq differential analysis methods. PLoS ONE. 2022;17(9):e0264246.","journal-title":"PLoS ONE"},{"key":"6372_CR30","doi-asserted-by":"crossref","unstructured":"Bi R, Liu P. Sample size calculation while controlling false discovery rate for differential expression analysis with RNA-sequencing experiments. BMC Bioinformatics. 2016 Mar 31;17(1).","DOI":"10.1186\/s12859-016-0994-9"},{"issue":"12","key":"6372_CR31","doi-asserted-by":"publisher","first-page":"970","DOI":"10.1089\/cmb.2012.0283","volume":"20","author":"SN Hart","year":"2013","unstructured":"Hart SN, Therneau TM, Zhang Y, Poland GA, Pierre J. Calculating Sample Size Estimates for RNA Sequencing Data. J Comput Biol. 2013;20(12):970\u20138.","journal-title":"J Comput Biol"},{"issue":"1","key":"6372_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.18637\/jss.v077.i01","volume":"77","author":"MN Wright","year":"2017","unstructured":"Wright MN, Ziegler A. Ranger: a fast implementation of random forests for high dimensional data in C++ and R. J Stat Softw. 2017;77(1):1\u201317.","journal-title":"J Stat Softw"},{"key":"6372_CR33","doi-asserted-by":"crossref","unstructured":"Chen T, Guestrin C. XGBoost: a scalable tree boosting system. 2016. Presented at: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining - KDD-16; August 13\u201317, 2016:785\u2013794; San Francisco, California, USA.","DOI":"10.1145\/2939672.2939785"},{"key":"6372_CR34","doi-asserted-by":"publisher","DOI":"10.1016\/j.health.2023.100216","volume":"4","author":"PK Mall","year":"2023","unstructured":"Mall PK, Singh PK, Srivastav S, Narayan V, Paprzycki M, Jaworska T, et al. A comprehensive review of deep neural networks for medical image processing: recent developments and future opportunities. Healthc Anal. 2023;4:100216.","journal-title":"Healthc Anal"},{"key":"6372_CR35","doi-asserted-by":"publisher","DOI":"10.1016\/j.ijmedinf.2021.104679","volume":"159","author":"EM Nwanosike","year":"2022","unstructured":"Nwanosike EM, Conway BR, Merchant HA, Hasan SS. Potential applications and performance of machine learning techniques and algorithms in clinical practice: a systematic review. Int J Med Inform. 2022;159:104679.","journal-title":"Int J Med Inform"},{"key":"6372_CR36","unstructured":"Foundation for Open Access Statistics. Fast scalable R with H20. 2015. URL: https:\/\/h2o.ai\/ [accessed 2024\u201311\u201325]"},{"issue":"26","key":"6372_CR37","doi-asserted-by":"publisher","DOI":"10.2196\/60231","volume":"17","author":"S Silvey","year":"2024","unstructured":"Silvey S, Liu J. Sample size requirements for popular classification algorithms in tabular clinical data: empirical study. J Med Internet Res. 2024;17(26):e60231.","journal-title":"J Med Internet Res"},{"key":"6372_CR38","volume-title":"Cambridge","author":"HJ Negative","year":"2007","unstructured":"Negative HJ, Regression B. Cambridge. England: Cambridge University Press; 2007."},{"issue":"3","key":"6372_CR39","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/BF02294361","volume":"52","author":"H Bozdogan","year":"1987","unstructured":"Bozdogan H. Model selection and Akaike\u2019s information criterion (AIC): the general theory and its analytical extensions. Psychometrika. 1987;52(3):345\u201370.","journal-title":"Psychometrika"},{"key":"6372_CR40","unstructured":"Vanegas L, Rond\u00f3n L, Paula G. _glmtoolbox: Set of tools to data analysis using generalized linear models_. 2024. URL: https:\/\/CRAN.R-project.org\/package=glmtoolbox [accessed 2024\u201311\u201319]"},{"issue":"3","key":"6372_CR41","doi-asserted-by":"publisher","first-page":"241","DOI":"10.1111\/j.1467-6419.1996.tb00013.x","volume":"10","author":"MR Veall","year":"2006","unstructured":"Veall MR, Zimmermann KF. Pseudo-R2 measures for some common limited dependent variable models. J Econ Surv. 2006;10(3):241\u201359.","journal-title":"J Econ Surv"},{"key":"6372_CR42","doi-asserted-by":"crossref","unstructured":"Wang Q, Armenia J, Zhang C, Penson AV, Reznik E, Zhang L, et al. Unifying cancer and normal RNA sequencing data from different sources. Scientific Data [Internet]. 2018 Apr 17 [cited 2019 Nov 27];5(1). Available from: https:\/\/www.nature.com\/articles\/sdata201861","DOI":"10.1038\/sdata.2018.61"},{"key":"6372_CR43","unstructured":"Argmann C, Hou R, Ungaro RC, Irizar H, Al-Taie Z, Huang R, et al. Biopsy and blood-based molecular biomarker of inflammation in IBD. Gut. 2022 Sep 15;gutjnl-2021-326451."},{"issue":"2","key":"6372_CR44","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1016\/j.ajodo.2023.05.007","volume":"164","author":"B Tomasz","year":"2023","unstructured":"Tomasz B, Geubbelmans M, Rousseau AJ, Valkenborg D. Validation of machine learning algorithms. Am J Orthodontics and Dentofacial Orthopedics. 2023;164(2):295\u20137.","journal-title":"Am J Orthodontics and Dentofacial Orthopedics."},{"key":"6372_CR45","doi-asserted-by":"publisher","DOI":"10.1093\/gigascience\/giaf036\/8131472","author":"G Gallitto","year":"2025","unstructured":"Gallitto G, Englert R, Kincses B, Kotikalapudi R, Li J, Hoffschlag K, et al. External validation of machine learning models\u2014registered models and adaptive sample splitting. GigaScience. 2025. https:\/\/doi.org\/10.1093\/gigascience\/giaf036\/8131472.","journal-title":"GigaScience."},{"issue":"1","key":"6372_CR46","doi-asserted-by":"publisher","first-page":"9359","DOI":"10.1038\/s41598-023-35818-w","volume":"13","author":"M Antunes-Ferreira","year":"2023","unstructured":"Antunes-Ferreira M, D\u2019Ambrosi S, Arkani M, Post E, et al. Tumor-educated platelet blood tests for Non-Small Cell Lung Cancer detection and management. Sci Rep. 2023;13(1):9359.","journal-title":"Sci Rep"},{"key":"6372_CR47","doi-asserted-by":"crossref","unstructured":"Burcu V, Koh L, Funda MK, Shuvadeep M, Rendleman J, Choi H, et al. (2019) Exploiting interdata relationships in next-generation proteomics analysis. Molecular Cellular Proteomics. 18(8):5\u201314.","DOI":"10.1074\/mcp.MR118.001246"}],"container-title":["BMC Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s12859-026-06372-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s12859-026-06372-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s12859-026-06372-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T09:57:43Z","timestamp":1772186263000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1186\/s12859-026-06372-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":47,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2026,12]]}},"alternative-id":["6372"],"URL":"https:\/\/doi.org\/10.1186\/s12859-026-06372-9","relation":{},"ISSN":["1471-2105"],"issn-type":[{"value":"1471-2105","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,31]]},"assertion":[{"value":"21 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All data is publicly available, thus no formal consent was necessary.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"The authors declare no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"53"}}