{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:42:56Z","timestamp":1776883376689,"version":"3.51.2"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,3,11]],"date-time":"2025-03-11T00:00:00Z","timestamp":1741651200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,3,11]],"date-time":"2025-03-11T00:00:00Z","timestamp":1741651200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Big Data"],"DOI":"10.1186\/s40537-025-01120-x","type":"journal-article","created":{"date-parts":[[2025,3,11]],"date-time":"2025-03-11T11:00:29Z","timestamp":1741690829000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Unsupervised label generation for severely imbalanced fraud data"],"prefix":"10.1186","volume":"12","author":[{"given":"Mary Anne","family":"Walauskis","sequence":"first","affiliation":[]},{"given":"Taghi M.","family":"Khoshgoftaar","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,11]]},"reference":[{"key":"1120_CR1","unstructured":"Xie J, Girshick R, Farhadi A. Unsupervised deep embedding for clustering analysis. In: International Conference on Machine Learning, 2016;478\u2013487. PMLR, New York, New York, USA."},{"key":"1120_CR2","unstructured":"U.S. Department of Justice: Criminal Resource Manual: 1007. Fraud. Accessed from https:\/\/www.justice.gov\/archives\/jm\/criminal-resource-manual-1007-fraud. (n.d.)."},{"key":"1120_CR3","unstructured":"Security.org: Credit Card Fraud Report. Accessed from https:\/\/www.security.org\/digital-safety\/credit-card-fraud-report\/. 2024."},{"key":"1120_CR4","unstructured":"Federal Trade Commission: Nationwide fraud losses top \\$10 billion in 2023 as FTC steps up efforts to protect the public. Accessed from https:\/\/www.ftc.gov\/news-events\/news\/press-releases\/2024\/02\/nationwide-fraud-losses-top-10-billion-2023-ftc-steps-efforts-protect-public. 2024."},{"key":"1120_CR5","unstructured":"Bureau of Justice Statistics: Victims of Identity Theft, 2021. Accessed from https:\/\/bjs.ojp.gov\/press-release\/victims-identity-theft-2021. 2021."},{"key":"1120_CR6","unstructured":"Social Security Administration Blog: Medicare Fraud Prevention Week. Accessed from https:\/\/blog.ssa.gov\/medicare-fraud-prevention-week\/. 2024."},{"key":"1120_CR7","unstructured":"U.S. Government Accountability Office: Federal Fraud: Challenges and Costs. Accessed from https:\/\/www.gao.gov\/products\/gao-24-105833. 2024."},{"key":"1120_CR8","unstructured":"Civil Division, U.S. Department of Justice: Fraud statistics, overview. Accessed from https:\/\/www.justice.gov\/opa\/press-release\/file\/1354316\/download. 2020."},{"key":"1120_CR9","unstructured":"U.S. Department of Justice: Criminal Resource Manual: 976. Health Care Fraud Generally. Accessed from https:\/\/www.justice.gov\/archives\/jm\/criminal-resource-manual-976-health-care-fraud-generally. (n.d.)."},{"issue":"4","key":"1120_CR10","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1007\/s13748-016-0094-0","volume":"5","author":"B Krawczyk","year":"2016","unstructured":"Krawczyk B. Learning from imbalanced data: open challenges and future directions. Prog Artif Intell. 2016;5(4):221\u201332. https:\/\/doi.org\/10.1007\/s13748-016-0094-0.","journal-title":"Prog Artif Intell"},{"key":"1120_CR11","unstructured":"Kaggle: Credit Card Fraud Detection. https:\/\/www.kaggle.com\/mlg-ulb\/creditcardfraud. Accessed: [Insert date of access here]. 2018."},{"issue":"4","key":"1120_CR12","doi-asserted-by":"publisher","first-page":"389","DOI":"10.1007\/s42979-023-01134-5","volume":"4","author":"JM Johnson","year":"2023","unstructured":"Johnson JM, Khoshgoftaar TM. Data-centric ai for healthcare fraud detection. SN Comput Sci. 2023;4(4):389. https:\/\/doi.org\/10.1007\/s42979-023-01134-5.","journal-title":"SN Comput Sci"},{"key":"1120_CR13","unstructured":"Sulaiman RB, Schetinin V, Sant P. Review of credit card fraud detection using machine learning. 2020. https:\/\/api.semanticscholar.org\/CorpusID:262477642."},{"key":"1120_CR14","doi-asserted-by":"crossref","unstructured":"So A, Hooshyar D, Park KW, Lim HS. Early diagnosis of dementia from clinical data by machine learning techniques. Appl Sci. 2017;7(7).","DOI":"10.3390\/app7070651"},{"issue":"1","key":"1120_CR15","first-page":"4190023","volume":"2022","author":"A Revathi","year":"2022","unstructured":"Revathi A, Kaladevi R, Ramana K, Jhaveri RH, Kumar MR, Kumar MSP. Early detection of cognitive decline using machine learning algorithm and cognitive ability test. Secur Commun Netw. 2022;2022(1):4190023.","journal-title":"Secur Commun Netw"},{"key":"1120_CR16","doi-asserted-by":"crossref","unstructured":"Naby AAE, Hemdan, EE-D, El-Sayed A. Deep learning approach for credit card fraud detection. In: 2021 International Conference on Electronic Engineering (ICEEM), 2021;1\u20135.","DOI":"10.1109\/ICEEM52022.2021.9480639"},{"issue":"2","key":"1120_CR17","doi-asserted-by":"publisher","first-page":"8","DOI":"10.1109\/MIS.2009.36","volume":"24","author":"A Halevy","year":"2009","unstructured":"Halevy A, Norvig P, Pereira F. The unreasonable effectiveness of data. IEEE Intell Syst. 2009;24(2):8\u201312. https:\/\/doi.org\/10.1109\/MIS.2009.36.","journal-title":"IEEE Intell Syst"},{"key":"1120_CR18","doi-asserted-by":"publisher","unstructured":"Babu AM, Pratap A. Credit card fraud detection using deep learning. In: 2020 IEEE Recent Advances in Intelligent Computational Systems (RAICS), 2020;32\u201336. https:\/\/doi.org\/10.1109\/RAICS51191.2020.9332497.","DOI":"10.1109\/RAICS51191.2020.9332497"},{"key":"1120_CR19","doi-asserted-by":"publisher","unstructured":"Li J, Stones RJ, Wang G, Li Z, Liu X, Xiao K. Being accurate is not enough: new metrics for disk failure prediction. In: 2016 IEEE 35th Symposium on Reliable Distributed Systems (SRDS), 2016;71\u201380. https:\/\/doi.org\/10.1109\/SRDS.2016.019.","DOI":"10.1109\/SRDS.2016.019"},{"key":"1120_CR20","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1007\/978-3-030-88942-5_6","volume-title":"Discovery Sci","author":"JG Gaudreault","year":"2021","unstructured":"Gaudreault JG, Branco P, Gama J. An analysis of performance metrics for imbalanced classification. In: Soares C, Torgo L, editors. Discovery Sci. Cham: Springer; 2021. p. 67\u201377."},{"key":"1120_CR21","doi-asserted-by":"publisher","unstructured":"Zhou Y, Yan H, Wang J, Chen Z, Ma A. Knowledge transfer-based network from medium and high-resolution sar imagery for built-up extraction with class-imbalanced data. In: 2023 SAR in Big Data Era (BIGSARDATA), 2023;1\u20134. https:\/\/doi.org\/10.1109\/BIGSARDATA59007.2023.10294846.","DOI":"10.1109\/BIGSARDATA59007.2023.10294846"},{"issue":"5","key":"1120_CR22","doi-asserted-by":"publisher","first-page":"2621","DOI":"10.24200\/sci.2019.51110.2010","volume":"27","author":"F Moslehi","year":"2020","unstructured":"Moslehi F, Haeri A, Gholamian MR. A novel selective clustering framework for appropriate labeling of clusters based on k-means algorithm. Scientia Iranica. 2020;27(5):2621\u201334. https:\/\/doi.org\/10.24200\/sci.2019.51110.2010.","journal-title":"Scientia Iranica"},{"key":"1120_CR23","doi-asserted-by":"publisher","first-page":"317","DOI":"10.1016\/j.ins.2019.05.042","volume":"557","author":"F Carcillo","year":"2021","unstructured":"Carcillo F, Borgne YAL, Caelen O, Kessaci Y, Obl\u00e9 F, Bontempi G. Combining unsupervised and supervised learning in credit card fraud detection. Inf Sci. 2021;557:317\u201331. https:\/\/doi.org\/10.1016\/j.ins.2019.05.042.","journal-title":"Inf Sci"},{"key":"1120_CR24","first-page":"60527","volume":"36","author":"A Gadetsky","year":"2023","unstructured":"Gadetsky A, Brbic M. The pursuit of human labeling: a new perspective on unsupervised learning. Adv Neural Inf Proc Syst. 2023;36:60527\u201346.","journal-title":"Adv Neural Inf Proc Syst"},{"key":"1120_CR25","doi-asserted-by":"publisher","first-page":"02386","DOI":"10.1016\/j.sciaf.2024.e02386","volume":"26","author":"EF Agyemang","year":"2024","unstructured":"Agyemang EF. Anomaly detection using unsupervised machine learning algorithms: a simulation study. Sci Afr. 2024;26:02386. https:\/\/doi.org\/10.1016\/j.sciaf.2024.e02386.","journal-title":"Sci Afr"},{"key":"1120_CR26","doi-asserted-by":"crossref","unstructured":"Zhang J, Wang Y, Yang Y, Luo Y, Ratner A. Binary classification with positive labeling sources. In: Proceedings of the 31st ACM International Conference on Information & Knowledge Management. CIKM \u201922, 2022;4672\u20134676. Association for Computing Machinery, New York, NY, USA.","DOI":"10.1145\/3511808.3557552"},{"issue":"3","key":"1120_CR27","first-page":"2299","volume":"35","author":"D Shi","year":"2023","unstructured":"Shi D, Zhu L, Li J, Cheng Z, Liu Z. Binary label learning for semi-supervised feature selection. IEEE Trans Knowl Data Eng. 2023;35(3):2299\u2013312.","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"1120_CR28","doi-asserted-by":"publisher","DOI":"10.1145\/3631326","author":"M Hort","year":"2024","unstructured":"Hort M, Chen Z, Zhang JM, Harman M, Sarro F. Bias mitigation for machine learning classifiers: a comprehensive survey. ACM J Responsib Comput. 2024. https:\/\/doi.org\/10.1145\/3631326.","journal-title":"ACM J Responsib Comput."},{"key":"1120_CR29","first-page":"2825","volume":"12","author":"F Pedregosa","year":"2011","unstructured":"Pedregosa F, Varoquaux G, Gramfort A, Michel V, Thirion B, Grisel O, Blondel M, Prettenhofer P, Weiss R, Dubourg V, Vanderplas J, Passos A, Cournapeau D, Brucher M, Perrot M, Duchesnay E. Scikit-learn: machine learning in python. J Mach Learn Res. 2011;12:2825\u201330.","journal-title":"J Mach Learn Res"},{"key":"1120_CR30","unstructured":"developers S-l. sklearn.preprocessing.normalize. https:\/\/scikit-learn.org\/1.5\/modules\/generated\/sklearn.preprocessing.normalize.html. Accessed: 2024-12-02. 2024."},{"key":"1120_CR31","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2022.109924","volume":"133","author":"LBV de Amorim","year":"2023","unstructured":"de Amorim LBV, Cavalcanti GDC, Cruz RMO. The choice of scaling technique matters for classification performance. Appl Soft Comput. 2023;133: 109924. https:\/\/doi.org\/10.1016\/j.asoc.2022.109924.","journal-title":"Appl Soft Comput"},{"key":"1120_CR32","unstructured":"Team P.D. pandas.DataFrame.sample \u2014 Pandas Documentation. https:\/\/pandas.pydata.org\/docs\/reference\/api\/pandas.DataFrame.sample.html. Accessed: 2024-12-02. 2024."},{"key":"1120_CR33","unstructured":"Yu Z, Jiang W, Alonso G. Efficient Tabular Data Preprocessing of ML Pipelines. 2024. arxiv:2409.14912. https:\/\/arxiv.org\/abs\/2409.14912."},{"issue":"19","key":"1120_CR34","doi-asserted-by":"publisher","first-page":"20149","DOI":"10.1007\/s11042-017-4566-4","volume":"76","author":"Q Zhan","year":"2017","unstructured":"Zhan Q, Mao Y. Improved spectral clustering based on nystr\u00f6m method. Multimed Tools Appl. 2017;76(19):20149\u201365. https:\/\/doi.org\/10.1007\/s11042-017-4566-4.","journal-title":"Multimed Tools Appl"},{"key":"1120_CR35","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1016\/j.procs.2020.04.017","volume":"171","author":"E Patel","year":"2020","unstructured":"Patel E, Kushwaha DS. Clustering cloud workloads: K-means vs Gaussian mixture model. Procedia Comput Sci. 2020;171:158\u201367. https:\/\/doi.org\/10.1016\/j.procs.2020.04.017. (Third International Conference on Computing and Network Communications (CoCoNet &apos;19)).","journal-title":"Procedia Comput Sci"},{"key":"1120_CR36","doi-asserted-by":"publisher","first-page":"242","DOI":"10.1109\/ojsp.2020.3039330","volume":"1","author":"F Pourkamali-Anaraki","year":"2020","unstructured":"Pourkamali-Anaraki F. Scalable spectral clustering with nystr\u00f6m approximation: practical and theoretical aspects. IEEE Open J Signal Proc. 2020;1:242\u201356. https:\/\/doi.org\/10.1109\/ojsp.2020.3039330.","journal-title":"IEEE Open J Signal Proc"},{"key":"1120_CR37","unstructured":"developers S-l. sklearn.kernel_approximation.Nystroem. Accessed: 2024-12-02. 2024."},{"key":"1120_CR38","doi-asserted-by":"publisher","unstructured":"Liu FT, Ting KM, Zhou Z-H. Isolation forest. In: 2008 Eighth IEEE International Conference on Data Mining, 2008;413\u2013422. https:\/\/doi.org\/10.1109\/ICDM.2008.17.","DOI":"10.1109\/ICDM.2008.17"},{"key":"1120_CR39","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1016\/j.patcog.2016.03.028","volume":"58","author":"SM Erfani","year":"2016","unstructured":"Erfani SM, Rajasegarar S, Karunasekera S, Leckie C. High-dimensional and large-scale anomaly detection using a linear one-class svm with deep learning. Pattern Recogn. 2016;58:121\u201334. https:\/\/doi.org\/10.1016\/j.patcog.2016.03.028.","journal-title":"Pattern Recogn"},{"key":"1120_CR40","doi-asserted-by":"publisher","DOI":"10.3390\/e24050611","author":"C Shao","year":"2022","unstructured":"Shao C, Du X, Yu J, Chen J. Cluster-based improved isolation forest. Entropy. 2022. https:\/\/doi.org\/10.3390\/e24050611.","journal-title":"Entropy."},{"key":"1120_CR41","doi-asserted-by":"publisher","unstructured":"Leevy JL, Khoshgoftaar TM, Hancock J. Evaluating performance metrics for credit card fraud classification. In: 2022 IEEE 34th International Conference on Tools with Artificial Intelligence (ICTAI), 2022;1336\u20131341. https:\/\/doi.org\/10.1109\/ICTAI56018.2022.00202.","DOI":"10.1109\/ICTAI56018.2022.00202"},{"key":"1120_CR42","unstructured":"for Medicare C, Services M. Medicare Part D Prescribers \u2014 By Provider and Drug. https:\/\/data.cms.gov\/providersummary-by-type-of-service\/medicare-part-dprescribers\/medicare-part-d-prescribers-by-provider-and-drug. Accessed: 2024-11-26. 2024."},{"key":"1120_CR43","unstructured":"of Health USD, Human\u00a0Services OoIG. LEIE Downloadable Databases. Accessed: 2024-11-26. 2024."},{"key":"1120_CR44","doi-asserted-by":"publisher","unstructured":"Bauder R, Khoshgoftaar TM. A novel method for fraudulent medicare claims detection from expected payment deviations (application paper). In: 2016 IEEE 17th International Conference on Information Reuse and Integration (IRI), 2016;11\u201319. https:\/\/doi.org\/10.1109\/IRI.2016.11.","DOI":"10.1109\/IRI.2016.11"},{"key":"1120_CR45","unstructured":"M\u00fcller D, Soto-Rey I, Kramer F. Towards a guideline for evaluation metrics in medical image segmentation. 2022. arxiv:2202.05273. https:\/\/arxiv.org\/abs\/2202.05273."},{"issue":"6","key":"1120_CR46","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s12864-019-6413-7","volume":"21","author":"D Chicco","year":"2020","unstructured":"Chicco D, Jurman G. The advantages of the matthews correlation coefficient (mcc) over f1 score and accuracy in binary classification evaluation. BMC Genomics. 2020;21(6):1\u201313. https:\/\/doi.org\/10.1186\/s12864-019-6413-7.","journal-title":"BMC Genomics"},{"key":"1120_CR47","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1007\/978-981-99-6346-1_10","volume-title":"Data analytics and learning","author":"KT Vasudev","year":"2024","unstructured":"Vasudev KT, Manohara Pai MM, Pai RM. Comparative analysis of generic outlier detection techniques. In: Guru DS, Kumar NV, Javed M, editors. Data analytics and learning. Singapore: Springer; 2024. p. 117\u201326."},{"issue":"1","key":"1120_CR48","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1016\/j.datak.2007.01.002","volume":"63","author":"R Gelbard","year":"2007","unstructured":"Gelbard R, Goldman O, Spiegler I. Investigating diversity of clustering methods: an empirical comparison. Data Knowl Eng. 2007;63(1):155\u201366. https:\/\/doi.org\/10.1016\/j.datak.2007.01.002.","journal-title":"Data Knowl Eng"}],"container-title":["Journal of Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s40537-025-01120-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s40537-025-01120-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s40537-025-01120-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,11]],"date-time":"2025-03-11T11:00:48Z","timestamp":1741690848000},"score":1,"resource":{"primary":{"URL":"https:\/\/journalofbigdata.springeropen.com\/articles\/10.1186\/s40537-025-01120-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,11]]},"references-count":48,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["1120"],"URL":"https:\/\/doi.org\/10.1186\/s40537-025-01120-x","relation":{},"ISSN":["2196-1115"],"issn-type":[{"value":"2196-1115","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3,11]]},"assertion":[{"value":"23 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"The authors declare that they have no Conflict of interest.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"63"}}