{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T09:56:51Z","timestamp":1773482211744,"version":"3.50.1"},"reference-count":85,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T00:00:00Z","timestamp":1764720000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T00:00:00Z","timestamp":1764720000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Data Sci Anal"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s41060-025-00966-x","type":"journal-article","created":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T08:29:27Z","timestamp":1764750567000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A similarity-based oversampling method for multi-label imbalanced text data"],"prefix":"10.1007","volume":"21","author":[{"given":"Ismail Hakki","family":"Karaman","sequence":"first","affiliation":[]},{"given":"Gulser","family":"Koksal","sequence":"additional","affiliation":[]},{"given":"Levent","family":"Eriskin","sequence":"additional","affiliation":[]},{"given":"Salih","family":"Salihoglu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,12,3]]},"reference":[{"key":"966_CR1","doi-asserted-by":"crossref","unstructured":"Aggarwal, C.C., Zhai, C.: A survey of text clustering algorithms. Mining text data, 77\u2013128 (2012)","DOI":"10.1007\/978-1-4614-3223-4_4"},{"key":"966_CR2","doi-asserted-by":"crossref","unstructured":"Fredriksson, T., Mattos, D.I., Bosch, J., Olsson, H.H.: Data labeling: An empirical investigation into industrial challenges and mitigation strategies. In: International Conference on Product-Focused Software Process Improvement, pp.202\u2013216. Springer (2020)","DOI":"10.1007\/978-3-030-64148-1_13"},{"key":"966_CR3","doi-asserted-by":"publisher","unstructured":"Singh, A.K., Shashi, M. (2019): Vectorization of text documents for identifying unifiable news articles. Int. J. Adv. Comput. Sci. Appl. https:\/\/doi.org\/10.14569\/IJACSA.2019.0100742","DOI":"10.14569\/IJACSA.2019.0100742"},{"issue":"9","key":"966_CR4","doi-asserted-by":"publisher","first-page":"10345","DOI":"10.1007\/s10462-023-10419-1","volume":"56","author":"DS Asudani","year":"2023","unstructured":"Asudani, D.S., Nagwani, N.K., Singh, P.: Impact of word embedding models on text analytics in deep learning environment: a review. Artif. Intell. Rev. 56(9), 10345\u201310425 (2023)","journal-title":"Artif. Intell. Rev."},{"key":"966_CR5","doi-asserted-by":"crossref","unstructured":"Ruzzetti, E.S., Ranaldi, L., Mastromattei, M., Fallucchi, F., Zanzotto, F.M.: Lacking the embedding of a word? look it up into a traditional dictionary. arXiv preprint arXiv:2109.11763 (2021)","DOI":"10.18653\/v1\/2022.findings-acl.208"},{"key":"966_CR6","doi-asserted-by":"crossref","unstructured":"Salihoglu, S., Koksal, G., Abar, O.: Enhancing next destination prediction: a novel long short-term memory neural network approach using real-world airline data. Engineering Applications of Artificial Intelligence, 109266 (2024)","DOI":"10.1016\/j.engappai.2024.109266"},{"issue":"4","key":"966_CR7","doi-asserted-by":"publisher","first-page":"150","DOI":"10.3390\/info10040150","volume":"10","author":"K Kowsari","year":"2019","unstructured":"Kowsari, K., Jafari Meimandi, K., Heidarysafa, M., Mendu, S., Barnes, L., Brown, D.: Text classification algorithms: a survey. Information 10(4), 150 (2019)","journal-title":"Information"},{"key":"966_CR8","unstructured":"Forman, G.: An extensive empirical study of feature selection metrics for text classification. J. Mach. Learn. Res. 1289\u20131305 (2003)"},{"key":"966_CR9","doi-asserted-by":"crossref","unstructured":"Balakrishnan, V., Lloyd-Yemoh, E.: Stemming and lemmatization: a comparison of retrieval performances (2014)","DOI":"10.7763\/LNSE.2014.V2.134"},{"key":"966_CR10","doi-asserted-by":"crossref","unstructured":"Kim, S.-B., Rim, H.-C., Yook, D., Lim, H.-S.: Effective methods for improving naive bayes text classifiers. In: Pacific Rim International Conference on Artificial Intelligence, pp.414\u2013423. Springer (2002)","DOI":"10.1007\/3-540-45683-X_45"},{"issue":"2","key":"966_CR11","doi-asserted-by":"publisher","first-page":"482","DOI":"10.1016\/j.dss.2007.06.002","volume":"44","author":"M Chau","year":"2008","unstructured":"Chau, M., Chen, H.: A machine learning approach to web page filtering using content and structure analysis. Decis. Support Syst. 44(2), 482\u2013494 (2008)","journal-title":"Decis. Support Syst."},{"key":"966_CR12","doi-asserted-by":"crossref","unstructured":"Joachims, T.: Text categorization with support vector machines: Learning with many relevant features. In: European Conference on Machine Learning, 137\u2013142. Springer, (1998)","DOI":"10.1007\/BFb0026683"},{"key":"966_CR13","doi-asserted-by":"crossref","unstructured":"Wang, Z.-Q., Sun, X., Zhang, D.-X., Li, X.: An optimal svm-based text classification algorithm. In: 2006 International Conference on Machine Learning and Cybernetics, 1378\u20131381. IEEE (2006)","DOI":"10.1109\/ICMLC.2006.258708"},{"issue":"3","key":"966_CR14","doi-asserted-by":"publisher","first-page":"290","DOI":"10.1007\/s11633-015-0912-z","volume":"15","author":"M Goudjil","year":"2018","unstructured":"Goudjil, M., Koudil, M., Bedda, M., Ghoggali, N.: A novel active learning method using svm for text classification. Int. J. Autom. Comput. 15(3), 290\u2013298 (2018)","journal-title":"Int. J. Autom. Comput."},{"key":"966_CR15","doi-asserted-by":"crossref","unstructured":"Yuan, P., Chen, Y., Jin, H., Huang, L.: Msvm-knn: Combining svm and k-nn for multi-class text classification. In: IEEE International Workshop on Semantic Computing and Systems, pp.133\u2013140, IEEE (2008)","DOI":"10.1109\/WSCS.2008.36"},{"issue":"4","key":"966_CR16","first-page":"21","volume":"13","author":"S Dumais","year":"1998","unstructured":"Dumais, S.: Using svms for text categorization. IEEE Intell. Syst. 13(4), 21\u201323 (1998)","journal-title":"IEEE Intell. Syst."},{"key":"966_CR17","unstructured":"Lewis, D.D., Ringuette, M.: A comparison of two learning algorithms for text categorization. In: Third Annual Symposium on Document Analysis and Information Retrieval, 33, 81\u201393 (1994)"},{"key":"966_CR18","doi-asserted-by":"crossref","unstructured":"Ho, T.K.: Random decision forests. In: Proceedings of 3rd International Conference on Document Analysis and Recognition, 1, 278\u2013282. IEEE (1995)","DOI":"10.1109\/ICDAR.1995.598994"},{"key":"966_CR19","doi-asserted-by":"crossref","unstructured":"Pranckevi\u010dius, T., Marcinkevi\u010dius, V.: Application of logistic regression with part-of-the-speech tagging for multi-class text classification. In: 2016 IEEE 4th Workshop on Advances in Information, Electronic and Electrical Engineering (AIEEE), 1\u20135. IEEE (2016)","DOI":"10.1109\/AIEEE.2016.7821805"},{"key":"966_CR20","doi-asserted-by":"crossref","unstructured":"G\u00fcrdil, H., So\u011fuksu, Y.B., Salihoglu, S., Co\u015fkun, F.: E\u011ftmde \u00f6\u00e7mede yapay zekanin entegrasyonu: Madde tepk kurami kapsaminda ver\u00fcretmnde chatgpt\u2019nn etk\u011f. Trakya E\u011fitim Dergisi 15(2), 887\u2013918","DOI":"10.24315\/tred.1509299"},{"key":"966_CR21","unstructured":"Kalla, D., Smith, N., Samaah, F., Kuraku, S.: Study and analysis of chat gpt and its impact on different fields of study. Int. J. Innov. Sci. Res. Technol.8(3) (2023)"},{"issue":"4","key":"966_CR22","doi-asserted-by":"publisher","first-page":"43","DOI":"10.3390\/bdcc8040043","volume":"8","author":"J Liu","year":"2024","unstructured":"Liu, J., Yang, L.: Knowledge-enhanced prompt learning for few-shot text classification. Big Data Cognit. Comput. 8(4), 43 (2024)","journal-title":"Big Data Cognit. Comput."},{"key":"966_CR23","doi-asserted-by":"crossref","unstructured":"Gupta, M., Varma, V., Damani, S., Narahari, K.N.: Compression of deep learning models for nlp. In: Proceedings of the 29th ACM International Conference on Information & Knowledge Management, 3507\u20133508 (2020)","DOI":"10.1145\/3340531.3412171"},{"key":"966_CR24","doi-asserted-by":"publisher","first-page":"68675","DOI":"10.1109\/ACCESS.2021.3077350","volume":"9","author":"S Singh","year":"2021","unstructured":"Singh, S., Mahmood, A.: The NLP cookbook: modern recipes for transformer based deep learning architectures. IEEE Access 9, 68675\u201368702 (2021)","journal-title":"IEEE Access"},{"issue":"4","key":"966_CR25","first-page":"1","volume":"52","author":"H Kaur","year":"2019","unstructured":"Kaur, H., Pannu, H.S., Malhi, A.K.: A systematic review on imbalanced data challenges in machine learning: Applications and solutions. ACM Comput. Surv. (CSUR) 52(4), 1\u201336 (2019)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"966_CR26","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1016\/j.eswa.2016.12.035","volume":"73","author":"G Haixiang","year":"2017","unstructured":"Haixiang, G., Yijing, L., Shang, J., Mingyun, G., Yuanyue, H., Bing, G.: Learning from class-imbalanced data: Review of methods and applications. Expert Syst. Appl. 73, 220\u2013239 (2017)","journal-title":"Expert Syst. Appl."},{"key":"966_CR27","doi-asserted-by":"publisher","first-page":"321","DOI":"10.1613\/jair.953","volume":"16","author":"NV Chawla","year":"2002","unstructured":"Chawla, N.V., Bowyer, K.W., Hall, L.O., Kegelmeyer, W.P.: Smote: synthetic minority over-sampling technique. J. Artif. Intell. Res. 16, 321\u2013357 (2002)","journal-title":"J. Artif. Intell. Res."},{"key":"966_CR28","doi-asserted-by":"publisher","first-page":"863","DOI":"10.1613\/jair.1.11192","volume":"61","author":"A Fern\u00e1ndez","year":"2018","unstructured":"Fern\u00e1ndez, A., Garcia, S., Herrera, F., Chawla, N.V.: Smote for learning from imbalanced data: progress and challenges, marking the 15-year anniversary. J. Artif. Intell. Res. 61, 863\u2013905 (2018)","journal-title":"J. Artif. Intell. Res."},{"key":"966_CR29","doi-asserted-by":"crossref","unstructured":"Charte, F., Rivera, A., Jesus, M.J.d., Herrera, F.: A first approach to deal with imbalance in multi-label datasets. In: International Conference on Hybrid Artificial Intelligence Systems, 150\u2013160. Springer (2013)","DOI":"10.1007\/978-3-642-40846-5_16"},{"key":"966_CR30","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.neucom.2014.08.091","volume":"163","author":"F Charte","year":"2015","unstructured":"Charte, F., Rivera, A.J., Jesus, M.J., Herrera, F.: Addressing imbalance in multilabel classification: measures and random resampling algorithms. Neurocomputing 163, 3\u201316 (2015)","journal-title":"Neurocomputing"},{"key":"966_CR31","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1016\/j.neucom.2016.08.158","volume":"326","author":"F Charte","year":"2019","unstructured":"Charte, F., Rivera, A.J., Jesus, M.J., Herrera, F.: Dealing with difficult minority labels in imbalanced mutilabel data sets. Neurocomputing 326, 39\u201353 (2019)","journal-title":"Neurocomputing"},{"key":"966_CR32","doi-asserted-by":"crossref","unstructured":"Charte, F., Rivera, A.J., Jesus, M.J.d., Herrera, F.: Mlenn: a first approach to heuristic multilabel undersampling. In: International Conference on Intelligent Data Engineering and Automated Learning, 1\u20139. Springer (2014)","DOI":"10.1007\/978-3-319-10840-7_1"},{"key":"966_CR33","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1016\/j.neucom.2019.11.076","volume":"383","author":"RM Pereira","year":"2020","unstructured":"Pereira, R.M., Costa, Y.M., Silla, C.N., Jr.: Mltl: a multi-label approach for the tomek link undersampling algorithm. Neurocomputing 383, 95\u2013105 (2020)","journal-title":"Neurocomputing"},{"key":"966_CR34","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2024.107107","volume":"184","author":"Q Dai","year":"2025","unstructured":"Dai, Q., Wang, L., Zhang, J., Ding, W., Chen, L.: Gqeo: Nearest neighbor graph-based generalized quadrilateral element oversampling for class-imbalance problem. Neural Netw. 184, 107107 (2025)","journal-title":"Neural Netw."},{"key":"966_CR35","doi-asserted-by":"crossref","unstructured":"Dai, Q., Liu, J.-w., Shi, Y.-h.: Class-overlap undersampling based on schur decomposition for class-imbalance problems. Expert Syst. Appl. 221, 119735 (2023)","DOI":"10.1016\/j.eswa.2023.119735"},{"issue":"1","key":"966_CR36","doi-asserted-by":"publisher","first-page":"11849","DOI":"10.1038\/s41598-021-91189-0","volume":"11","author":"X Tang","year":"2021","unstructured":"Tang, X., Mou, H., Liu, J., Du, X.: Research on automatic labeling of imbalanced texts of customer complaints based on text enhancement and layer-by-layer semantic matching. Sci. Rep. 11(1), 11849 (2021)","journal-title":"Sci. Rep."},{"issue":"3","key":"966_CR37","first-page":"290","volume":"9","author":"LD Cahya","year":"2023","unstructured":"Cahya, L.D., Luthfiarta, A., Krisna, J.I.T., Winarno, S., Nugraha, A.: Improving multi-label classification performance on imbalanced datasets through smote technique and data augmentation using indobert model. J. Nas. Teknol. Sist. Inf. 9(3), 290\u2013298 (2023)","journal-title":"J. Nas. Teknol. Sist. Inf."},{"issue":"3","key":"966_CR38","first-page":"423","volume":"20","author":"AY Taha","year":"2021","unstructured":"Taha, A.Y., Tiun, S., Abd Rahman, A.H., Sabah, A.: Multilabel over-sampling and under-sampling with class alignment for imbalanced multilabel text classification. J. Inf. Commun. Technol. 20(3), 423\u2013456 (2021)","journal-title":"J. Inf. Commun. Technol."},{"key":"966_CR39","doi-asserted-by":"crossref","unstructured":"Luo, Y., Feng, H., Weng, X., Huang, K., Zheng, H.: A novel oversampling method based on seqgan for imbalanced text classification. In: 2019 IEEE International Conference on Big Data (Big Data), 2891\u20132894. IEEE (2019)","DOI":"10.1109\/BigData47090.2019.9006138"},{"issue":"2","key":"966_CR40","doi-asserted-by":"publisher","first-page":"869","DOI":"10.3390\/app11020869","volume":"11","author":"S Shaikh","year":"2021","unstructured":"Shaikh, S., Daudpota, S.M., Imran, A.S., Kastrati, Z.: Towards improved classification accuracy on highly imbalanced text dataset using deep neural language models. Appl. Sci. 11(2), 869 (2021)","journal-title":"Appl. Sci."},{"key":"966_CR41","doi-asserted-by":"crossref","unstructured":"Chen, T., Xu, R., Lu, Q., Liu, B., Xu, J., Yao, L., He, Z.: A sentence vector based over-sampling method for imbalanced emotion classification. In: International Conference on Intelligent Text Processing and Computational Linguistics, 62\u201372. Springer (2014)","DOI":"10.1007\/978-3-642-54903-8_6"},{"key":"966_CR42","doi-asserted-by":"crossref","unstructured":"Moreo, A., Esuli, A., Sebastiani, F.: Distributional random oversampling for imbalanced text classification. In: Proceedings of the 39th International ACM SIGIR Conference on Research and Development in Information Retrieval, 805\u2013808 (2016)","DOI":"10.1145\/2911451.2914722"},{"key":"966_CR43","doi-asserted-by":"crossref","unstructured":"Mohasseb, A., Bader-El-Den, M., Cocea, M., Liu, H.: Improving imbalanced question classification using structured smote based approach. In: 2018 International Conference on Machine Learning and Cybernetics (ICMLC), 2, 593\u2013597. IEEE (2018)","DOI":"10.1109\/ICMLC.2018.8527028"},{"key":"966_CR44","doi-asserted-by":"crossref","unstructured":"Morris, J.X., Lifland, E., Yoo, J.Y., Grigsby, J., Jin, D., Qi, Y.: Textattack: A framework for adversarial attacks, data augmentation, and adversarial training in nlp. arXiv preprint arXiv:2005.05909 (2020)","DOI":"10.18653\/v1\/2020.emnlp-demos.16"},{"key":"966_CR45","doi-asserted-by":"crossref","unstructured":"Qiu, S., Xu, B., Zhang, J., Wang, Y., Shen, X., De\u00a0Melo, G., Long, C., Li, X.: Easyaug: An automatic textual data augmentation platform for classification tasks. In: Companion Proceedings of the Web Conference 2020, 249\u2013252 (2020)","DOI":"10.1145\/3366424.3383552"},{"key":"966_CR46","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.knosys.2018.06.019","volume":"160","author":"Y Li","year":"2018","unstructured":"Li, Y., Guo, H., Zhang, Q., Gu, M., Yang, J.: Imbalanced text sentiment classification using universal and domain-specific knowledge. Knowl. Based Syst. 160, 1\u201315 (2018)","journal-title":"Knowl. Based Syst."},{"key":"966_CR47","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2021.115067","volume":"179","author":"J Jang","year":"2021","unstructured":"Jang, J., Kim, Y., Choi, K., Suh, S.: Sequential targeting: a continual learning approach for data imbalance in text classification. Expert Syst. Appl. 179, 115067 (2021)","journal-title":"Expert Syst. Appl."},{"key":"966_CR48","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2020.105833","volume":"196","author":"G Shi","year":"2020","unstructured":"Shi, G., Feng, C., Xu, W., Liao, L., Huang, H.: Penalized multiple distribution selection method for imbalanced data classification. Knowl. Based Syst. 196, 105833 (2020)","journal-title":"Knowl. Based Syst."},{"key":"966_CR49","unstructured":"Tian, J., Chen, S., Zhang, X., Feng, Z.: A graph-based measurement for text imbalance classification. In: ECAI 2020, 2188\u20132195. IOS Press (2020)"},{"key":"966_CR50","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1016\/j.eswa.2017.03.020","volume":"80","author":"M Pavlinek","year":"2017","unstructured":"Pavlinek, M., Podgorelec, V.: Text classification method based on self-training and lda topic models. Expert Syst. Appl. 80, 83\u201393 (2017)","journal-title":"Expert Syst. Appl."},{"issue":"3","key":"966_CR51","doi-asserted-by":"publisher","first-page":"909","DOI":"10.1109\/TII.2017.2737827","volume":"14","author":"D Wu","year":"2017","unstructured":"Wu, D., Luo, X., Wang, G., Shang, M., Yuan, Y., Yan, H.: A highly accurate framework for self-labeled semisupervised classification in industrial applications. IEEE Trans. Industr. Inf. 14(3), 909\u2013920 (2017)","journal-title":"IEEE Trans. Industr. Inf."},{"key":"966_CR52","doi-asserted-by":"crossref","unstructured":"Meng, Y., Shen, J., Zhang, C., Han, J.: Weakly-supervised neural text classification. In: Proceedings of the 27th ACM International Conference on Information and Knowledge Management, 983\u2013992 (2018)","DOI":"10.1145\/3269206.3271737"},{"issue":"3","key":"966_CR53","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1007\/s10791-008-9083-7","volume":"12","author":"R Guzm\u00e1n-Cabrera","year":"2009","unstructured":"Guzm\u00e1n-Cabrera, R., Montes-y-G\u00f3mez, M., Rosso, P., Villasenor-Pineda, L.: Using the web as corpus for self-training text categorization. Inf. Retrieval 12(3), 400\u2013415 (2009)","journal-title":"Inf. Retrieval"},{"issue":"6","key":"966_CR54","first-page":"65","volume":"29","author":"B Zhang","year":"2007","unstructured":"Zhang, B., Bai, B., Su, J.: Semi-supervised text classification based on self-training em algorithm. J. Natl. Univ. Def. Technol. 29(6), 65 (2007)","journal-title":"J. Natl. Univ. Def. Technol."},{"key":"966_CR55","doi-asserted-by":"crossref","unstructured":"Karisani, P., Karisani, N.: Semi-supervised text classification via self pretraining. In: Proceedings of the 14th ACM International Conference on Web Search and Data Mining, 40\u201348 (2021)","DOI":"10.1145\/3437963.3441814"},{"key":"966_CR56","doi-asserted-by":"crossref","unstructured":"Lanquillon, C.: Partially supervised text classification: Combining labeled and unlabeled documents using an em-like scheme. In: European Conference on Machine Learning, 229\u2013237 (2000). Springer","DOI":"10.1007\/3-540-45164-1_24"},{"key":"966_CR57","doi-asserted-by":"crossref","unstructured":"Xu, Z., Iwaihara, M.: Semantic space-based self-training for semi-supervised multi-label text classification. In: DEIM Forum E24-2 (2021)","DOI":"10.1007\/978-3-030-91669-5_20"},{"key":"966_CR58","unstructured":"Desmond, M., Duesterwald, E., Brimijoin, K., Brachman, M., Pan, Q.: Semi-automated data labeling. In: NeurIPS 2020 Competition and Demonstration Track, 156\u2013169. PMLR (2021)"},{"issue":"11","key":"966_CR59","doi-asserted-by":"publisher","first-page":"2154","DOI":"10.14778\/3476249.3476269","volume":"14","author":"H Zhang","year":"2021","unstructured":"Zhang, H., Cao, L., Madden, S., Rundensteiner, E.: Lancet: labeling complex data at scale. Proc. VLDB Endow. 14(11), 2154\u20132166 (2021)","journal-title":"Proc. VLDB Endow."},{"key":"966_CR60","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1016\/j.ins.2015.04.003","volume":"317","author":"MS Hajmohammadi","year":"2015","unstructured":"Hajmohammadi, M.S., Ibrahim, R., Selamat, A., Fujita, H.: Combination of active learning and self-training for cross-lingual sentiment classification with density analysis of unlabelled samples. Inf. Sci. 317, 67\u201377 (2015)","journal-title":"Inf. Sci."},{"key":"966_CR61","doi-asserted-by":"crossref","unstructured":"Luo, M., Shi, X., Ji, Q., Shang, M., He, X., Tao, W.: A deep self-learning classification framework for incomplete medical patents with multi-label. In: The International Conference on Natural Computation, Fuzzy Systems and Knowledge Discovery, 566\u2013573. Springer (2019)","DOI":"10.1007\/978-3-030-32591-6_61"},{"key":"966_CR62","unstructured":"Li, H., Caragea, D., Caragea, C.: Combining self-training with deep learning for disaster tweet classification. In: The 18th International Conference on Information Systems for Crisis Response and Management (ISCRAM 2021) (2021)"},{"key":"966_CR63","doi-asserted-by":"crossref","unstructured":"Meng, Y., Zhang, Y., Huang, J., Xiong, C., Ji, H., Zhang, C., Han, J.: Text classification using label names only: A language model self-training approach. arXiv preprint arXiv:2010.07245 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.724"},{"key":"966_CR64","doi-asserted-by":"crossref","unstructured":"Ye, Z., Geng, Y., Chen, J., Chen, J., Xu, X., Zheng, S., Wang, F., Zhang, J., Chen, H.: Zero-shot text classification via reinforced self-training. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, 3014\u20133024 (2020)","DOI":"10.18653\/v1\/2020.acl-main.272"},{"issue":"11","key":"966_CR65","doi-asserted-by":"publisher","first-page":"3535","DOI":"10.1007\/s10489-020-01732-1","volume":"50","author":"J Li","year":"2020","unstructured":"Li, J., Zhu, Q.: A boosting self-training framework based on instance generation with natural neighbors for k nearest neighbor. Appl. Intell. 50(11), 3535\u20133553 (2020)","journal-title":"Appl. Intell."},{"key":"966_CR66","doi-asserted-by":"publisher","DOI":"10.1016\/j.swevo.2020.100736","volume":"58","author":"Z Donyavi","year":"2020","unstructured":"Donyavi, Z., Asadi, S.: Using decomposition-based multi-objective evolutionary algorithm as synthetic example optimization for self-labeling. Swarm Evol. Comput. 58, 100736 (2020)","journal-title":"Swarm Evol. Comput."},{"key":"966_CR67","first-page":"21199","volume":"33","author":"S Mukherjee","year":"2020","unstructured":"Mukherjee, S., Awadallah, A.: Uncertainty-aware self-training for few-shot text classification. Adv. Neural. Inf. Process. Syst. 33, 21199\u201321212 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"9","key":"966_CR68","doi-asserted-by":"publisher","first-page":"421","DOI":"10.3390\/info11090421","volume":"11","author":"J Wang","year":"2020","unstructured":"Wang, J., Dong, Y.: Measurement of text similarity: a survey. Information 11(9), 421 (2020)","journal-title":"Information"},{"key":"966_CR69","unstructured":"Norouzi, M., Fleet, D.J., Salakhutdinov, R.R.: Hamming distance metric learning. Adv. Neural Inf. process. Syst.25 (2012)"},{"key":"966_CR70","doi-asserted-by":"crossref","unstructured":"Magara, M.B., Ojo, S.O., Zuva, T.: A comparative analysis of text similarity measures and algorithms in research paper recommender systems. In: 2018 Conference on Information Communications Technology and Society (ICTAS), 1\u20135. IEEE (2018)","DOI":"10.1109\/ICTAS.2018.8368766"},{"key":"966_CR71","doi-asserted-by":"crossref","unstructured":"Clark, S.: Vector space models of lexical meaning. The Handbook of Contemporary semantic theory, 493\u2013522 (2015)","DOI":"10.1002\/9781118882139.ch16"},{"key":"966_CR72","doi-asserted-by":"publisher","first-page":"7940","DOI":"10.1109\/ACCESS.2016.2619719","volume":"4","author":"A Amin","year":"2016","unstructured":"Amin, A., Anwar, S., Adnan, A., Nawaz, M., Howard, N., Qadir, J., Hawalah, A., Hussain, A.: Comparing oversampling techniques to handle the class imbalance problem: A customer churn prediction case study. IEEE Access 4, 7940\u20137957 (2016)","journal-title":"IEEE Access"},{"issue":"3","key":"966_CR73","doi-asserted-by":"publisher","first-page":"1","DOI":"10.4018\/jdwm.2007070101","volume":"3","author":"G Tsoumakas","year":"2007","unstructured":"Tsoumakas, G., Katakis, I.: Multi-label classification: an overview. Int. J. Data Warehousing Min. (IJDWM) 3(3), 1\u201313 (2007)","journal-title":"Int. J. Data Warehousing Min. (IJDWM)"},{"issue":"1","key":"966_CR74","first-page":"19","volume":"5","author":"D Ganda","year":"2018","unstructured":"Ganda, D., Buch, R.: A survey on multi label classification. Recent Trends Program. Lang. 5(1), 19\u201323 (2018)","journal-title":"Recent Trends Program. Lang."},{"issue":"8","key":"966_CR75","doi-asserted-by":"publisher","first-page":"1819","DOI":"10.1109\/TKDE.2013.39","volume":"26","author":"M-L Zhang","year":"2013","unstructured":"Zhang, M.-L., Zhou, Z.-H.: A review on multi-label learning algorithms. IEEE Trans. Knowl. Data Eng. 26(8), 1819\u20131837 (2013)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"966_CR76","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1016\/j.neucom.2020.07.055","volume":"417","author":"S Nazmi","year":"2020","unstructured":"Nazmi, S., Yan, X., Homaifar, A., Doucette, E.: Evolving multi-label classification rules by exploiting high-order label correlations. Neurocomputing 417, 176\u2013186 (2020)","journal-title":"Neurocomputing"},{"issue":"3","key":"966_CR77","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2716262","volume":"47","author":"E Gibaja","year":"2015","unstructured":"Gibaja, E., Ventura, S.: A tutorial on multilabel learning. ACM Comput. Surv. (CSUR) 47(3), 1\u201338 (2015)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"966_CR78","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107965","volume":"118","author":"AN Tarekegn","year":"2021","unstructured":"Tarekegn, A.N., Giacobini, M., Michalak, K.: A review of methods for imbalanced multi-label classification. Pattern Recogn. 118, 107965 (2021)","journal-title":"Pattern Recogn."},{"key":"966_CR79","doi-asserted-by":"crossref","unstructured":"Tsoumakas, G., Katakis, I., Vlahavas, I.: Mining multi-label data. Data mining and knowledge discovery handbook, 667\u2013685 (2009)","DOI":"10.1007\/978-0-387-09823-4_34"},{"key":"966_CR80","unstructured":"Gao, W., Zhou, Z.-H.: On the consistency of multi-label learning. In: Proceedings of the 24th Annual Conference on Learning Theory, 341\u2013358. JMLR Workshop and Conference Proceedings (2011)"},{"issue":"3","key":"966_CR81","doi-asserted-by":"publisher","first-page":"1083","DOI":"10.1016\/j.eswa.2014.08.036","volume":"42","author":"SM Liu","year":"2015","unstructured":"Liu, S.M., Chen, J.-H.: A multi-label classification based approach for sentiment classification. Expert Syst. Appl. 42(3), 1083\u20131093 (2015)","journal-title":"Expert Syst. Appl."},{"key":"966_CR82","doi-asserted-by":"crossref","unstructured":"Feng, Y., Zhou, M., Tong, X.: Imbalanced classification: a paradigm-based review. Stat. Anal. Data Min. The ASA Data Sci. J. 14(5), 383\u2013406 (2021)","DOI":"10.1002\/sam.11538"},{"key":"966_CR83","doi-asserted-by":"crossref","unstructured":"Wilson, S., Schaub, F., Dara, A.A., Liu, F., Cherivirala, S., Leon, P.G., Andersen, M.S., Zimmeck, S., Sathyendra, K.M., Russell, N.C., : The creation and analysis of a website privacy policy corpus. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 1330\u20131340 (2016)","DOI":"10.18653\/v1\/P16-1126"},{"key":"966_CR84","doi-asserted-by":"crossref","unstructured":"Bernardini, F.C., Silva, R.B., Rodovalho, R.M., Meza, E.B.M.: Cardinality and density measures and their influence to multi-label learning methods. Submitted to Learning and Nonlinear Models (2014)","DOI":"10.21528\/LNLM-vol12-no1-art4"},{"key":"966_CR85","doi-asserted-by":"crossref","unstructured":"Han, H., Wang, W.-Y., Mao, B.-H.: Borderline-smote: a new over-sampling method in imbalanced data sets learning. In: International Conference on Intelligent Computing, 878\u2013887. Springer (2005)","DOI":"10.1007\/11538059_91"}],"container-title":["International Journal of Data Science and Analytics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s41060-025-00966-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s41060-025-00966-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s41060-025-00966-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T09:35:31Z","timestamp":1773480931000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s41060-025-00966-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,3]]},"references-count":85,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["966"],"URL":"https:\/\/doi.org\/10.1007\/s41060-025-00966-x","relation":{},"ISSN":["2364-415X","2364-4168"],"issn-type":[{"value":"2364-415X","type":"print"},{"value":"2364-4168","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12,3]]},"assertion":[{"value":"1 November 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest or Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"The authors declare no Conflict of interest.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"45"}}