{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T18:03:19Z","timestamp":1772042599586,"version":"3.50.1"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2020,1,11]],"date-time":"2020-01-11T00:00:00Z","timestamp":1578700800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,1,11]],"date-time":"2020-01-11T00:00:00Z","timestamp":1578700800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Netw Model Anal Health Inform Bioinforma"],"published-print":{"date-parts":[[2020,12]]},"DOI":"10.1007\/s13721-020-0218-0","type":"journal-article","created":{"date-parts":[[2020,1,11]],"date-time":"2020-01-11T12:02:30Z","timestamp":1578744150000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Sample size determination for biomedical big data with limited labels"],"prefix":"10.1007","volume":"9","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3269-867X","authenticated-orcid":false,"given":"Aaron N.","family":"Richter","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Taghi M.","family":"Khoshgoftaar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,1,11]]},"reference":[{"key":"218_CR1","unstructured":"Agarwal A, Chapelle O, Dudk M, Langford J (2014) Reliable effective terascale linear learning system. J Mach Learn Res 15:1111\u20131133. http:\/\/jmlr.org\/papers\/v15\/agarwal14a.html"},{"issue":"1pt2","key":"218_CR2","doi-asserted-by":"publisher","first-page":"347","DOI":"10.1111\/1475-6773.12139","volume":"49","author":"AM Audet","year":"2014","unstructured":"Audet AM, Squires D, Doty MM (2014) Where are we on the diffusion curve? trends and drivers of primary care physicians\u2019 use of health information technology. Health Serv Res 49(1pt2):347\u2013360. https:\/\/doi.org\/10.1111\/1475-6773.12139","journal-title":"Health Serv Res"},{"key":"218_CR3","doi-asserted-by":"publisher","unstructured":"Bacardit J, Widera P, Marquez-Chamorro A, Divina F, Aguilar-Ruiz JS, Krasnogor N (2012) Contact map prediction using a large-scale ensemble of rule sets and the fusion of multiple predicted structural features. Bioinformatics 28(19):2441\u20132448. https:\/\/doi.org\/10.1093\/bioinformatics\/bts472https:\/\/academic.oup.com\/bioinformatics\/article-lookup\/doi\/10.1093\/bioinformatics\/bts472","DOI":"10.1093\/bioinformatics\/bts472"},{"key":"218_CR4","doi-asserted-by":"publisher","DOI":"10.1186\/1471-2105-7-S5-S15","author":"A Baten","year":"2006","unstructured":"Baten A, Chang B, Halgamuge S, Li J (2006) Splice site identification using probabilistic parameters and SVM classification. BMC Bioinform. https:\/\/doi.org\/10.1186\/1471-2105-7-S5-S15","journal-title":"BMC Bioinform"},{"key":"218_CR5","doi-asserted-by":"publisher","unstructured":"Bauder RA, Khoshgoftaar TM, Hasanin T (2018) An empirical study on class rarity in big data. In: 2018 17th IEEE International Conference on Machine Learning and Applications (ICMLA), IEEE, Orlando, FL, pp 785\u2013790. https:\/\/doi.org\/10.1109\/ICMLA.2018.00125, https:\/\/ieeexplore.ieee.org\/document\/8614150\/","DOI":"10.1109\/ICMLA.2018.00125"},{"key":"218_CR6","doi-asserted-by":"crossref","unstructured":"Chang CC, Lin CJ (2011) LIBSVM: a library for support vector machines. ACM Transac Intell Syst Technol 2:27:1\u201327:27. http:\/\/www.csie.ntu.edu.tw\/~cjlin\/libsvm","DOI":"10.1145\/1961189.1961199"},{"key":"218_CR7","unstructured":"DARPA (2018) Learning with less labels (LwLL) - HR001118s0044 (Archived) - Federal Business Opportunities: Opportunities. https:\/\/www.fbo.gov\/index?s=opportunity&mode=form&id=e76d8e2ccbb9361a9e2810adfb50146f&tab=core&_cview=1"},{"key":"218_CR8","doi-asserted-by":"publisher","unstructured":"Figueroa RL, Zeng-Treitler Q, Kandula S, Ngo LH (2012) Predicting sample size required for classification performance. BMC Med Inform Decision Making 12(1). https:\/\/doi.org\/10.1186\/1472-6947-12-8, http:\/\/bmcmedinformdecismak.biomedcentral.com\/articles\/10.1186\/1472-6947-12-8","DOI":"10.1186\/1472-6947-12-8"},{"key":"218_CR9","doi-asserted-by":"publisher","unstructured":"Hajian-Tilaki K (2014) Sample size estimation in diagnostic test studies of biomedical informatics. J Biomed Inform 48:193\u2013204. https:\/\/doi.org\/10.1016\/j.jbi.2014.02.013https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1532046414000501","DOI":"10.1016\/j.jbi.2014.02.013"},{"issue":"1","key":"218_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/2196-1115-1-2","volume":"1","author":"M Herland","year":"2014","unstructured":"Herland M, Khoshgoftaar TM, Wald R (2014) A review of data mining using big data in health informatics. J Big Data 1(1):1\u201335. https:\/\/doi.org\/10.1186\/2196-1115-1-2","journal-title":"J Big Data"},{"key":"218_CR11","doi-asserted-by":"publisher","unstructured":"Herland M, Khoshgoftaar TM, Bauder RA (2018) Big data fraud detection using multiple medicare data sources. J Big Data 5(1). https:\/\/doi.org\/10.1186\/s40537-018-0138-3, https:\/\/journalofbigdata.springeropen.com\/articles\/10.1186\/s40537-018-0138-3","DOI":"10.1186\/s40537-018-0138-3"},{"key":"218_CR12","unstructured":"Jones E, Oliphant T, Peterson P (2014) SciPy: open source scientific tools for Python. http:\/\/www.scipy.org\/"},{"key":"218_CR13","unstructured":"Karpathy A (2017) Software 2.0. https:\/\/medium.com\/@karpathy\/software-2-0-a64152b37c35"},{"key":"218_CR14","doi-asserted-by":"publisher","unstructured":"Leevy JL, Khoshgoftaar TM, Bauder RA, Seliya N (2018) A survey on addressing high-class imbalance in big data. J Big Data 5(1). https:\/\/doi.org\/10.1186\/s40537-018-0151-6, https:\/\/journalofbigdata.springeropen.com\/articles\/10.1186\/s40537-018-0151-6","DOI":"10.1186\/s40537-018-0151-6"},{"key":"218_CR15","volume-title":"Sample size determination in health studies: a practical manual","author":"SK Lwanga","year":"1991","unstructured":"Lwanga SK, Lemeshow S, Organization WH et al (1991) Sample size determination in health studies: a practical manual. World Health Organization, Geneva"},{"key":"218_CR16","doi-asserted-by":"crossref","unstructured":"McKinney W (2010) Data structures for statistical computing in python. In: van\u00a0der Walt S, Millman J (eds) Proceedings of the 9th Python in Science Conference, pp 51 \u2013 56","DOI":"10.25080\/Majora-92bf1922-00a"},{"key":"218_CR17","doi-asserted-by":"publisher","unstructured":"Mukherjee S, Tamayo P, Rogers S, Rifkin R, Engle A, Campbell C, Golub TR, Mesirov JP (2003) Estimating dataset size requirements for classifying DNA microarray data. J Comput Biol 10(2):119\u2013142. https:\/\/doi.org\/10.1089\/106652703321825928, http:\/\/www.liebertpub.com\/","DOI":"10.1089\/106652703321825928"},{"key":"218_CR18","unstructured":"NIH (2018) Cancer statistics. https:\/\/www.cancer.gov\/about-cancer\/understanding\/statistics"},{"key":"218_CR19","first-page":"2825","volume":"12","author":"F Pedregosa","year":"2011","unstructured":"Pedregosa F, Varoquaux G, Gramfort A, Michel V, Thirion B, Grisel O, Blondel M, Prettenhofer P, Weiss R, Dubourg V, Vanderplas J, Passos A, Cournapeau D, Brucher M, Perrot M, Duchesnay E (2011) Scikit-learn: machine learning in python. J Mach Learn Res 12:2825\u20132830","journal-title":"J Mach Learn Res"},{"key":"218_CR20","doi-asserted-by":"publisher","unstructured":"Provost F, Jensen D, Oates T (1999) Efficient progressive sampling. In: Proceedings of the fifth ACM SIGKDD international conference on Knowledge discovery and data mining - KDD \u201999, ACM Press, San Diego, California, United States, pp 23\u201332. https:\/\/doi.org\/10.1145\/312129.312188, http:\/\/portal.acm.org\/citation.cfm?doid=312129.312188","DOI":"10.1145\/312129.312188"},{"key":"218_CR21","unstructured":"R Core Team (2017) R: a language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https:\/\/www.R-project.org\/"},{"key":"218_CR22","doi-asserted-by":"publisher","unstructured":"Richter AN, Khoshgoftaar TM (2017a) Modernizing analytics for melanoma with a large-scale research dataset. In: 2017 IEEE International Conference on Information Reuse and Integration (IRI), IEEE, San Diego, CA, pp 551\u2013558. https:\/\/doi.org\/10.1109\/IRI.2017.45, http:\/\/ieeexplore.ieee.org\/document\/8102982\/","DOI":"10.1109\/IRI.2017.45"},{"key":"218_CR23","doi-asserted-by":"publisher","unstructured":"Richter AN, Khoshgoftaar TM (2017b) Predicting sentinel node status in melanoma from a real-world EHR dataset. In: 2017 IEEE International Conference on Bioinformatics and Biomedicine (BIBM), IEEE, Kansas City, MO, pp 1872\u20131878. https:\/\/doi.org\/10.1109\/BIBM.2017.8217945, http:\/\/ieeexplore.ieee.org\/document\/8217945\/","DOI":"10.1109\/BIBM.2017.8217945"},{"key":"218_CR24","doi-asserted-by":"publisher","unstructured":"Richter AN, Khoshgoftaar TM (2019) Melanoma risk modeling from limited positive samples. Netw Model Anal Health Inform Bioinform 8(1). https:\/\/doi.org\/10.1007\/s13721-019-0186-4, http:\/\/link.springer.com\/10.1007\/s13721-019-0186-4","DOI":"10.1007\/s13721-019-0186-4"},{"key":"218_CR25","doi-asserted-by":"publisher","unstructured":"Rio Sd, Benitez JM, Herrera F (2015) Analysis of data preprocessing increasing the oversampling ratio for extremely imbalanced big data classification. In: 2015 IEEE Trustcom\/BigDataSE\/ISPA, vol\u00a02, pp 180\u2013185. https:\/\/doi.org\/10.1109\/Trustcom.2015.579","DOI":"10.1109\/Trustcom.2015.579"},{"key":"218_CR26","unstructured":"Sam S (2019) Learning with limited labeled data. http:\/\/vision.cloudera.com\/learning-with-limited-labeled-data\/"},{"key":"218_CR27","first-page":"47","volume-title":"Active learning literature survey","author":"B Settles","year":"2009","unstructured":"Settles B (2009) Active learning literature survey. University of Wisconsin-Madison Department of Computer Sciences, Madison, p 47"},{"issue":"10","key":"218_CR28","doi-asserted-by":"publisher","first-page":"1135","DOI":"10.1038\/nbt1486","volume":"26","author":"J Shendure","year":"2008","unstructured":"Shendure J, Ji H (2008) Next-generation DNA sequencing. Nat Biotechnol 26(10):1135","journal-title":"Nat Biotechnol"},{"key":"218_CR29","unstructured":"Sonnenburg S, Franc V (2010) COFFIN: A computational framework for linear SVMs. In: ICML, pp 999\u20131006"},{"key":"218_CR30","doi-asserted-by":"publisher","unstructured":"Sun C, Shrivastava A, Singh S, Gupta A (2017) Revisiting unreasonable effectiveness of data in deep learning era. In: 2017 IEEE International Conference on Computer Vision (ICCV), IEEE, Venice, pp 843\u2013852. https:\/\/doi.org\/10.1109\/ICCV.2017.97, http:\/\/ieeexplore.ieee.org\/document\/8237359\/","DOI":"10.1109\/ICCV.2017.97"},{"key":"218_CR31","doi-asserted-by":"publisher","unstructured":"Triguero I, del Ro S, Lpez V, Bacardit J, Bentez JM, Herrera F (2015) ROSEFW-RF: The winner algorithm for the ECBDL14 big data competition: an extremely imbalanced big data bioinformatics problem. Knowledge-based systems 87:69\u201379. https:\/\/doi.org\/10.1016\/j.knosys.2015.05.027, http:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705115002130","DOI":"10.1016\/j.knosys.2015.05.027"},{"issue":"2","key":"218_CR32","doi-asserted-by":"publisher","first-page":"22","DOI":"10.1109\/MCSE.2011.37","volume":"13","author":"S van der Walt","year":"2011","unstructured":"van der Walt S, Colbert SC, Varoquaux G (2011) The numpy array: a structure for efficient numerical computation. Comput Sci Eng 13(2):22\u201330. https:\/\/doi.org\/10.1109\/MCSE.2011.37","journal-title":"Comput Sci Eng"},{"key":"218_CR33","doi-asserted-by":"crossref","unstructured":"van\u00a0der Ploeg T, Austin PC, Steyerberg EW (2014) Modern modelling techniques are data hungry: a simulation study for predicting dichotomous endpoints. BMC medical research methodology 14(1):137. https:\/\/bmcmedresmethodol.biomedcentral.com\/articles\/10.1186\/1471-2288-14-137","DOI":"10.1186\/1471-2288-14-137"},{"key":"218_CR34","doi-asserted-by":"crossref","unstructured":"Van\u00a0Hulse J, Khoshgoftaar TM, Napolitano A (2007) Experimental perspectives on learning from imbalanced data. In: Proceedings of the 24th international conference on Machine learning, ACM, pp 935\u2013942. http:\/\/dl.acm.org\/citation.cfm?id=1273614","DOI":"10.1145\/1273496.1273614"}],"container-title":["Network Modeling Analysis in Health Informatics and Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13721-020-0218-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s13721-020-0218-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13721-020-0218-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,1,10]],"date-time":"2021-01-10T00:59:37Z","timestamp":1610240377000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s13721-020-0218-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,1,11]]},"references-count":34,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2020,12]]}},"alternative-id":["218"],"URL":"https:\/\/doi.org\/10.1007\/s13721-020-0218-0","relation":{},"ISSN":["2192-6662","2192-6670"],"issn-type":[{"value":"2192-6662","type":"print"},{"value":"2192-6670","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,1,11]]},"assertion":[{"value":"1 September 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 January 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 January 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"12"}}