{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T01:53:59Z","timestamp":1769824439776,"version":"3.49.0"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T00:00:00Z","timestamp":1740355200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T00:00:00Z","timestamp":1740355200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-025-03707-w","type":"journal-article","created":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T06:52:00Z","timestamp":1740379920000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Inverted Index for Similar Document Detection: A Case Study at Can Tho University Journal of Science"],"prefix":"10.1007","volume":"6","author":[{"given":"Hai Thanh","family":"Nguyen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ky Hoa","family":"Duong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Linh Thuy Thi","family":"Pham","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Phuong Ha Dang","family":"Bui","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nguyen","family":"Thai-Nghe","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5208-2843","authenticated-orcid":false,"given":"Tran Thanh","family":"Dien","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,24]]},"reference":[{"issue":"2","key":"3707_CR1","doi-asserted-by":"publisher","first-page":"224","DOI":"10.1504\/ijicbm.2016.074482","volume":"12","author":"M Arora","year":"2016","unstructured":"Arora M, Kanjilal U, Varshney D. Evaluation of information retrieval: precision and recall. Int J Indian Cult Bus Manag. 2016;12(2):224. https:\/\/doi.org\/10.1504\/ijicbm.2016.074482.","journal-title":"Int J Indian Cult Bus Manag"},{"key":"3707_CR2","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1016\/j.procs.2017.06.038","volume":"111","author":"K Baba","year":"2017","unstructured":"Baba K, Nakatoh T, Minami T. Plagiarism detection using document similarity based on distributed representation. Proc Comput Sci. 2017;111:382\u20137. https:\/\/doi.org\/10.1016\/j.procs.2017.06.038.","journal-title":"Proc Comput Sci"},{"issue":"4","key":"3707_CR3","doi-asserted-by":"publisher","first-page":"525","DOI":"10.1037\/amp0000882","volume":"77","author":"J Berger","year":"2022","unstructured":"Berger J, Packard G. Using natural language processing to understand people and culture. Am Psychol. 2022;77(4):525.","journal-title":"Am Psychol"},{"key":"3707_CR4","doi-asserted-by":"crossref","unstructured":"Bevilacqua M, Pasini T, Raganato A, Navigli R. Recent trends in word sense disambiguation: a survey. In: International joint conference on artificial intelligence. 2021. p. 4330\u201338.","DOI":"10.24963\/ijcai.2021\/593"},{"key":"3707_CR5","doi-asserted-by":"publisher","unstructured":"Bruch S, Nardini FM, Rulli C, Venturini R. Efficient inverted indexes for approximate retrieval over learned sparse representations. In: Proceedings of the 47th international ACM SIGIR conference on research and development in information retrieval, SIGIR 2024, vol.\u00a042. ACM; 2024. p. 152\u2013162. https:\/\/doi.org\/10.1145\/3626772.3657769.","DOI":"10.1145\/3626772.3657769"},{"key":"3707_CR6","doi-asserted-by":"crossref","unstructured":"Cao S, Vo H, Le HTT, Dinh D. Hybrid approach for text similarity detection in Vietnamese based on sentence-BERT and WordNet. In: 2022 4th international conference on information technology and computer communications (ITCC). New York: ACM; 2022.","DOI":"10.1145\/3548636.3548645"},{"issue":"1","key":"3707_CR7","doi-asserted-by":"publisher","first-page":"363","DOI":"10.1007\/s11227-023-05472-0","volume":"80","author":"CY Chang","year":"2023","unstructured":"Chang CY, Jhang SJ, Wu SJ, Roy DS. Jcf: joint coarse- and fine-grained similarity comparison for plagiarism detection based on nlp. J Supercomput. 2023;80(1):363\u201394. https:\/\/doi.org\/10.1007\/s11227-023-05472-0.","journal-title":"J Supercomput"},{"key":"3707_CR8","doi-asserted-by":"publisher","unstructured":"Dey S, Moharana B, De UC, Samant T, Behera TM, Banerjee S. Search engine for qna using distributed inverted index system. In: 2024 3rd international conference for innovation in technology (INOCON). IEEE; 2024. https:\/\/doi.org\/10.1109\/inocon60754.2024.10511792.","DOI":"10.1109\/inocon60754.2024.10511792"},{"key":"3707_CR9","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/1196\/1\/012069","volume":"1196","author":"NE Diana","year":"2019","unstructured":"Diana NE, Ulfa IH. Measuring performance of n-gram and jaccard-similarity metrics in document plagiarism application. J Phys Conf Ser. 2019;1196: 012069. https:\/\/doi.org\/10.1088\/1742-6596\/1196\/1\/012069.","journal-title":"J Phys Conf Ser"},{"key":"3707_CR10","doi-asserted-by":"publisher","unstructured":"Dien TT, Han HN, Thai-Nghe N. An approach for plagiarism detection in learning resources. Springer International Publishing; 2019. p. 722\u201330. https:\/\/doi.org\/10.1007\/978-3-030-35653-8_52.","DOI":"10.1007\/978-3-030-35653-8_52"},{"issue":"1","key":"3707_CR11","doi-asserted-by":"publisher","first-page":"2609","DOI":"10.1007\/s11042-023-15703-4","volume":"83","author":"MA El-Rashidy","year":"2023","unstructured":"El-Rashidy MA, Mohamed RG, El-Fishawy NA, Shouman MA. An effective text plagiarism detection system based on feature selection and svm techniques. Multimed Tools Appl. 2023;83(1):2609\u201346. https:\/\/doi.org\/10.1007\/s11042-023-15703-4.","journal-title":"Multimed Tools Appl"},{"issue":"1","key":"3707_CR12","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/1566\/1\/012112","volume":"1566","author":"M Faisal","year":"2020","unstructured":"Faisal M, Zamzami EM. Sutarman: comparative analysis of inter-centroid k-means performance using Euclidean distance, Canberra distance and Manhattan distance. J Phys Conf Ser. 2020;1566(1): 012112. https:\/\/doi.org\/10.1088\/1742-6596\/1566\/1\/012112.","journal-title":"J Phys Conf Ser"},{"key":"3707_CR13","doi-asserted-by":"crossref","unstructured":"Habibi M, Priadana A, Saputra AB, Cahyo PW. Topic modelling of germas related content on instagram using latent Dirichlet allocation (lda). In: International conference on health and medical sciences (AHMS 2020). Atlantis Press; 2021. p. 260\u201364.","DOI":"10.2991\/ahsr.k.210127.060"},{"key":"3707_CR14","doi-asserted-by":"publisher","unstructured":"Han J, Kamber M, Pei J. Getting to know your data. Elsevier; 2012, p. 39\u201382. https:\/\/doi.org\/10.1016\/b978-0-12-381479-1.00002-2.","DOI":"10.1016\/b978-0-12-381479-1.00002-2"},{"key":"3707_CR15","doi-asserted-by":"publisher","unstructured":"He R, Qu Y. Partitioned inverted index compression using hierarchical Dirichlet process. In: 2024 4th international conference on neural networks, information and communication (NNICE). IEEE; 2024. https:\/\/doi.org\/10.1109\/nnice61279.2024.10499155.","DOI":"10.1109\/nnice61279.2024.10499155"},{"issue":"2S11","key":"3707_CR16","doi-asserted-by":"publisher","first-page":"473","DOI":"10.35940\/ijrte.b1073.0982s1119","volume":"8","author":"PH Ho","year":"2019","unstructured":"Ho PH, Vo TH, Nguyen NAT, Nguyen HHC. A narrative method for evaluating documents similarity based on unique strings. Int J Recent Technol Eng. 2019;8(2S11):473\u20139. https:\/\/doi.org\/10.35940\/ijrte.b1073.0982s1119.","journal-title":"Int J Recent Technol Eng"},{"key":"3707_CR17","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.110212","volume":"148","author":"K Hu","year":"2024","unstructured":"Hu K, Zhong Z, Sun L, Huo Q. Mathematical formula detection in document images: a new dataset and a new approach. Pattern Recogn. 2024;148: 110212. https:\/\/doi.org\/10.1016\/j.patcog.2023.110212.","journal-title":"Pattern Recogn"},{"issue":"3","key":"3707_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2019.102188","volume":"57","author":"MJ Hussain","year":"2020","unstructured":"Hussain MJ, Wasti SH, Huang G, Wei L, Jiang Y, Tang Y. An approach for measuring semantic similarity between Wikipedia concepts using multiple inheritances. Inf Process Manag. 2020;57(3): 102188.","journal-title":"Inf Process Manag"},{"issue":"2","key":"3707_CR19","first-page":"469","volume":"11","author":"S Jain","year":"2023","unstructured":"Jain S, Vishwakarma S, Jain S. Analysis of term weighting schemes in vector space model for text classification. J Integr Sci Technol. 2023;11(2):469\u2013469.","journal-title":"J Integr Sci Technol"},{"key":"3707_CR20","doi-asserted-by":"publisher","unstructured":"Kalbaliyev E, Rustamov S. Text similarity detection using machine learning algorithms with character-based similarity measures. In: Digital interaction and machine intelligence. Springer International Publishing; 2021. pp. 11\u20139. https:\/\/doi.org\/10.1007\/978-3-030-74728-2_2.","DOI":"10.1007\/978-3-030-74728-2_2"},{"key":"3707_CR21","doi-asserted-by":"publisher","unstructured":"Lamiya K, Mohan A. A document similarity computation method based on word embedding and citation analysis. Springer Singapore; 2018. p. 161\u201368. https:\/\/doi.org\/10.1007\/978-981-10-8633-5_17.","DOI":"10.1007\/978-981-10-8633-5_17"},{"key":"3707_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1155\/2022\/7923262","volume":"2022","author":"F Lan","year":"2022","unstructured":"Lan F. Research on text similarity measurement hybrid algorithm with term semantic information and TF-IDF method. Adv Multimed. 2022;2022:1\u201311. https:\/\/doi.org\/10.1155\/2022\/7923262.","journal-title":"Adv Multimed"},{"issue":"2","key":"3707_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3495162","volume":"13","author":"Q Li","year":"2022","unstructured":"Li Q, Peng H, Li J, Xia C, Yang R, Sun L, Yu PS, He L. A survey on text classification: from traditional to deep learning. ACM Trans Intell Syst Technol. 2022;13(2):1\u201341. https:\/\/doi.org\/10.1145\/3495162.","journal-title":"ACM Trans Intell Syst Technol"},{"key":"3707_CR24","doi-asserted-by":"crossref","unstructured":"Li X, Yao C, Fan F, Yu X. A text similarity measurement method based on singular value decomposition and semantic relevance. J Inf Process Syst. 2017.","DOI":"10.3745\/JIPS.02.0067"},{"issue":"3","key":"3707_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3439726","volume":"54","author":"S Minaee","year":"2021","unstructured":"Minaee S, Kalchbrenner N, Cambria E, Nikzad N, Chenaghlu M, Gao J. Deep learning-based text classification. ACM Comput Surv. 2021;54(3):1\u201340. https:\/\/doi.org\/10.1145\/3439726.","journal-title":"ACM Comput Surv"},{"key":"3707_CR26","doi-asserted-by":"publisher","unstructured":"Nguyen HT, Huynh TN, Le AD, Dien TT. Topic classification based on scientific article structure: a case study at Can Tho University Journal of Science. Springer Nature Singapore; 2023. p. 208\u201315. https:\/\/doi.org\/10.1007\/978-981-99-7649-2_16.","DOI":"10.1007\/978-981-99-7649-2_16"},{"key":"3707_CR27","doi-asserted-by":"publisher","unstructured":"Nguyen HT, Le AD, Thai-Nghe N, Dien TT. An approach for similarity Vietnamese documents detection from English documents. Springer Nature Singapore; 2022. p. 574\u201387. https:\/\/doi.org\/10.1007\/978-981-19-8069-5_39.","DOI":"10.1007\/978-981-19-8069-5_39"},{"key":"3707_CR28","first-page":"17","volume":"53","author":"TPT Nguyen","year":"2019","unstructured":"Nguyen TPT, et al. Using and improving cosine similarity algorithm for building and managing question bank. J Tech Educ Sci. 2019;53:17\u201324.","journal-title":"J Tech Educ Sci"},{"key":"3707_CR29","unstructured":"Niwattanakul S, Singthongchai J, Naenudorn E, Wanapu S. Using of jaccard coefficient for keywords similarity. In: Proceedings of the international multiconference of engineers and computer scientists, vol. 6. 2013. p. 380\u201384."},{"issue":"5","key":"3707_CR30","doi-asserted-by":"publisher","first-page":"396","DOI":"10.1080\/08839514.2020.1723868","volume":"34","author":"K Park","year":"2020","unstructured":"Park K, Hong JS, Kim W. A methodology combining cosine similarity with classifier for text classification. Appl Artif Intell. 2020;34(5):396\u2013411. https:\/\/doi.org\/10.1080\/08839514.2020.1723868.","journal-title":"Appl Artif Intell"},{"key":"3707_CR31","unstructured":"Pham VHT, Pham LH. A research on the text comparison method using cosine similarity. Vietnam J Sci Technol Eng. 2017;59(1)."},{"key":"3707_CR32","unstructured":"Phuc NH, Lai PV, Vi Bao Ngoc NCC, Truong NM. A new method for evaluating semantic similarity of vietnamese texts based on word2vec model. J Mil Sci Technol. 2018:103\u2013111."},{"key":"3707_CR33","doi-asserted-by":"crossref","unstructured":"Qader WA, Ameen MM, Ahmed BI. An overview of bag of words; importance, implementation, applications, and challenges. In: 2019 international engineering conference (IEC). IEEE; 2019. p. 200\u201304.","DOI":"10.1109\/IEC47844.2019.8950616"},{"key":"3707_CR34","doi-asserted-by":"publisher","unstructured":"Rabiah S. Language as a tool for communication and cultural reality discloser. 2018. https:\/\/doi.org\/10.31227\/osf.io\/nw94m.","DOI":"10.31227\/osf.io\/nw94m"},{"key":"3707_CR35","unstructured":"Ramos J, et al. Using tf-idf to determine word relevance in document queries. In: Proceedings of the first instructional conference on machine learning, vol. 242. Citeseer; 2003. p. 29\u201348."},{"issue":"2","key":"3707_CR36","doi-asserted-by":"publisher","first-page":"305","DOI":"10.33395\/sinkron.v5i2.10909","volume":"5","author":"OA Resta","year":"2021","unstructured":"Resta OA, Aditya A, Purwiantono FE. Plagiarism detection in students\u2019 theses using the cosine similarity method. SinkrOn. 2021;5(2):305\u201313. https:\/\/doi.org\/10.33395\/sinkron.v5i2.10909.","journal-title":"SinkrOn"},{"key":"3707_CR37","doi-asserted-by":"publisher","unstructured":"Rushkin I. Document similarity from vector space densities. Springer International Publishing; 2020. p. 160\u201371. https:\/\/doi.org\/10.1007\/978-3-030-55187-2_14.","DOI":"10.1007\/978-3-030-55187-2_14"},{"key":"3707_CR38","doi-asserted-by":"publisher","first-page":"4111","DOI":"10.1007\/s11042-020-09423-2","volume":"80","author":"S Sharma","year":"2021","unstructured":"Sharma S, Gupta V, Juneja M. Diverse feature set based keyphrase extraction and indexing techniques. Multimed Tools Appl. 2021;80:4111\u201342.","journal-title":"Multimed Tools Appl"},{"issue":"5","key":"3707_CR39","first-page":"270","volume":"2","author":"D Sinwar","year":"2014","unstructured":"Sinwar D, Kaushik R. Study of Euclidean and Manhattan distance metrics using simple k-means clustering. Int J Res Appl Sci Eng Technol. 2014;2(5):270\u20134.","journal-title":"Int J Res Appl Sci Eng Technol"},{"key":"3707_CR40","doi-asserted-by":"publisher","unstructured":"Thanh Nguyen H, Kieu Nguyen T, Tri Pham M, Le Hoang Tran C, Thanh Dien T, Thai-Nghe N. Similar Vietnamese document detection in online assignment submission system. Springer International Publishing; 2022. p. 251\u201364. https:\/\/doi.org\/10.1007\/978-3-031-08580-2_23.","DOI":"10.1007\/978-3-031-08580-2_23"},{"issue":"9","key":"3707_CR41","doi-asserted-by":"publisher","first-page":"421","DOI":"10.3390\/info11090421","volume":"11","author":"J Wang","year":"2020","unstructured":"Wang J, Dong Y. Measurement of text similarity: a survey. Information. 2020;11(9):421. https:\/\/doi.org\/10.3390\/info11090421.","journal-title":"Information"},{"key":"3707_CR42","doi-asserted-by":"publisher","unstructured":"Wang J, Xu W, Yan W, Li C. Text similarity calculation method based on hybrid model of LDA and TF-IDF. In: Proceedings of the 2019 3rd international conference on computer science and artificial intelligence. ACM. 2019. https:\/\/doi.org\/10.1145\/3374587.3374590.","DOI":"10.1145\/3374587.3374590"},{"key":"3707_CR43","doi-asserted-by":"publisher","unstructured":"Wang J, Xu W, Yan W, Li C. Text similarity calculation method based on hybrid model of lda and tf-idf. In: Proceedings of the 2019 3rd international conference on computer science and artificial intelligence, CSAI2019. ACM. 2019. https:\/\/doi.org\/10.1145\/3374587.3374590.","DOI":"10.1145\/3374587.3374590"},{"issue":"3","key":"3707_CR44","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3654974","volume":"2","author":"M Widmoser","year":"2024","unstructured":"Widmoser M, Kocher D, Augsten N. Scalable distributed inverted list indexes in disaggregated memory. Proc ACM Manag Data. 2024;2(3):1\u201327. https:\/\/doi.org\/10.1145\/3654974.","journal-title":"Proc ACM Manag Data"},{"key":"3707_CR45","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.5765","author":"S Yang","year":"2020","unstructured":"Yang S, Huang G, Ofoghi B, Yearwood J. Short text similarity measurement using context-aware weighted biterms. Concurr Comput Pract Exp. 2020. https:\/\/doi.org\/10.1002\/cpe.5765.","journal-title":"Concurr Comput Pract Exp"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-03707-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-025-03707-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-03707-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T06:52:07Z","timestamp":1740379927000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-025-03707-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,24]]},"references-count":45,"journal-issue":{"issue":"3","published-online":{"date-parts":[[2025,3]]}},"alternative-id":["3707"],"URL":"https:\/\/doi.org\/10.1007\/s42979-025-03707-w","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,24]]},"assertion":[{"value":"6 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"The dataset used has no ethical risk and is public dataset.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and informed consent for data used"}}],"article-number":"216"}}