{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,17]],"date-time":"2026-06-17T14:22:04Z","timestamp":1781706124684,"version":"3.54.5"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,8,13]],"date-time":"2025-08-13T00:00:00Z","timestamp":1755043200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,8,13]],"date-time":"2025-08-13T00:00:00Z","timestamp":1755043200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["2021-0- 01341-003"],"award-info":[{"award-number":["2021-0- 01341-003"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Big Data"],"DOI":"10.1186\/s40537-025-01257-9","type":"journal-article","created":{"date-parts":[[2025,8,13]],"date-time":"2025-08-13T10:11:54Z","timestamp":1755079914000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Data augmentation for dense passage retrieval using corpus-passage frequency-based token deletion"],"prefix":"10.1186","volume":"12","author":[{"given":"A-Seong","family":"Moon","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kyumin","family":"Kim","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jaesung","family":"Lee","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,8,13]]},"reference":[{"issue":"4","key":"1257_CR1","doi-asserted-by":"publisher","first-page":"333","DOI":"10.1561\/1500000019","volume":"3","author":"S Robertson","year":"2009","unstructured":"Robertson S, Zaragoza H. The probabilistic relevance framework: Bm25 and beyond. Found Trends Inf Retr. 2009;3(4):333\u201389.","journal-title":"Found Trends Inf Retr"},{"key":"1257_CR2","doi-asserted-by":"crossref","unstructured":"Karpukhin V, Oguz B, Min S, Lewis P, Wu L, Edunov S, Chen D, Yih W-T. Dense passage retrieval for open-domain question answering. In: Proceedings of the 15th Conference on Empirical Methods in Natural Language Processing, Virtual; 2020. p. 6769\u20136781.","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"issue":"9","key":"1257_CR3","doi-asserted-by":"publisher","first-page":"1263","DOI":"10.1109\/TKDE.2008.239","volume":"21","author":"H He","year":"2009","unstructured":"He H, Garcia EA. Learning from imbalanced data. IEEE Trans Knowl Data Eng. 2009;21(9):1263\u201384.","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"1257_CR4","doi-asserted-by":"crossref","unstructured":"Gao L, Dai Z, Callan J. COIL: Revisit exact lexical match in information retrieval with contextualized inverted list. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Mexico City, Mexico; 2021. p. 3030\u20133042.","DOI":"10.18653\/v1\/2021.naacl-main.241"},{"key":"1257_CR5","doi-asserted-by":"crossref","unstructured":"Formal T, Piwowarski B, Clinchant S. Splade: Sparse lexical and expansion model for first stage ranking. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, Virtual 2021.","DOI":"10.1145\/3404835.3463098"},{"issue":"1","key":"1257_CR6","doi-asserted-by":"publisher","first-page":"329","DOI":"10.1162\/tacl_a_00369","volume":"9","author":"Y Luan","year":"2021","unstructured":"Luan Y, Eisenstein J, Toutanova K, Collins M. Sparse, dense, and attentional representations for text retrieval. Trans Assoc Comput Linguist. 2021;9(1):329\u201345.","journal-title":"Trans Assoc Comput Linguist"},{"key":"1257_CR7","doi-asserted-by":"crossref","unstructured":"Reichman B, Heck L. Dense passage retrieval: Is it retrieving? In: Al-Onaizan, Y., Bansal, M., Chen, Y.-N. (eds). Findings of the Association for Computational Linguistics: EMNLP 2024, Miami, USA; November 12-16, 2024. p. 13540\u201313553.","DOI":"10.18653\/v1\/2024.findings-emnlp.791"},{"key":"1257_CR8","doi-asserted-by":"crossref","unstructured":"Dong Z, Ni J, Bikel D, Alfonseca E, Wang Y, Qu C, Zitouni I. Exploring dual encoder architectures for question answering. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, United Arab Emirates; 2022. p. 9414\u20139419.","DOI":"10.18653\/v1\/2022.emnlp-main.640"},{"key":"1257_CR9","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K. BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 17th Conference of the North American Chapter of the Association for Computational Linguistics, Minnesota, USA; June 2-7, 2019. p. 4171\u20134186."},{"key":"1257_CR10","doi-asserted-by":"crossref","unstructured":"Song X, Salcianu A, Song Y, Dopson D, Zhou D. Fast WordPiece tokenization. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, Punta Cana, Dominican Repub; 2021. p. 2089\u20132103.","DOI":"10.18653\/v1\/2021.emnlp-main.160"},{"key":"1257_CR11","doi-asserted-by":"crossref","unstructured":"Tenney I, Das D, Pavlick E. BERT rediscovers the classical NLP pipeline. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Florence, Italy; 2019. p. 4593\u20134601.","DOI":"10.18653\/v1\/P19-1452"},{"key":"1257_CR12","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I. Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, Long Beach, California, USA; 2017. p. 6000\u20136010."},{"key":"1257_CR13","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J. Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, Las vegas, USA; 2016. p. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"1257_CR14","doi-asserted-by":"crossref","unstructured":"Clark K, Khandelwal U, Levy O, Manning CD. What does BERT look at? an analysis of BERT\u2019s attention. In: Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP, Florence, Italy; 2019. p. 276\u2013286.","DOI":"10.18653\/v1\/W19-4828"},{"key":"1257_CR15","doi-asserted-by":"crossref","unstructured":"Ren R, Lv S, Qu Y, Liu J, Zhao WX, She Q, Wu H, Wang H, Wen J-R. PAIR: leveraging passage-centric similarity relation for improving dense passage retrieval. In: Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, Bangkok, Thailand; August 1-6, 2021. p. 2173\u20132183.","DOI":"10.18653\/v1\/2021.findings-acl.191"},{"key":"1257_CR16","doi-asserted-by":"crossref","unstructured":"Wang L, Yang N, Huang X, Jiao B, Yang L, Jiang D, Majumder R, Wei F. SimLM: Pre-training with representation bottleneck for dense passage retrieval. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, Toronto, Canada; 2023. p. 2244\u20132258.","DOI":"10.18653\/v1\/2023.acl-long.125"},{"key":"1257_CR17","unstructured":"Mussmann S, Ermon S. Learning and inference via maximum inner product search. In: Proceedings of the 33rd International Conference on International Conference on Machine Learning, New York, USA; 2016. p. 2587\u20132596."},{"issue":"4","key":"1257_CR18","doi-asserted-by":"publisher","first-page":"824","DOI":"10.1109\/TPAMI.2018.2889473","volume":"42","author":"YA Malkov","year":"2020","unstructured":"Malkov YA, Yashunin DA. Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE Trans Pattern Anal Mach Intell. 2020;42(4):824\u201336.","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1257_CR19","unstructured":"Guu K, Lee K, Tung Z, Pasupat P, Chang M. Retrieval augmented language model pre-training. In: Proceedings of the 37th International Conference on Machine Learning, Virtual; 2020. p. 3929\u20133938."},{"key":"1257_CR20","doi-asserted-by":"crossref","unstructured":"Zhou K, Gong Y, Liu X, Zhao WX, Shen Y, Dong A, Lu J, Majumder R, Wen J-r, Duan N. SimANS: Simple ambiguous negatives sampling for dense text retrieval. In: Proceedings of the 17th Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, United Arab Emirates; 2022. p. 548\u2013559.","DOI":"10.18653\/v1\/2022.emnlp-industry.56"},{"issue":"1","key":"1257_CR21","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1162\/tacl_a_00564","volume":"11","author":"DS Sachan","year":"2023","unstructured":"Sachan DS, Lewis M, Yogatama D, Zettlemoyer L, Pineau J, Zaheer M. Questions are all you need to train a dense passage retriever. Trans Assoc Comput Linguist. 2023;11(1):600\u201316.","journal-title":"Trans Assoc Comput Linguist"},{"issue":"60","key":"1257_CR22","first-page":"1","volume":"6","author":"C Shorten","year":"2019","unstructured":"Shorten C, Khoshgoftaar TM. A survey on image data augmentation for deep learning. J Big Data. 2019;6(60):1\u201348.","journal-title":"J Big Data"},{"key":"1257_CR23","doi-asserted-by":"crossref","unstructured":"Liu, P., Wang, X., Xiang, C., Meng, W.: A survey of text data augmentation. In: 2020 International Conference on Computer Communication and Network Security (CCNS), Xi an, China; 2020. p. 191\u2013195.","DOI":"10.1109\/CCNS50731.2020.00049"},{"key":"1257_CR24","doi-asserted-by":"crossref","unstructured":"Feng SY, Gangal V, Wei J, Chandar S, Vosoughi S, Mitamura T, Hovy E. A survey of data augmentation approaches for NLP. In: Findings of The Joint Conference of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, Bankok, Thailand; 2021. p. 968\u2013988.","DOI":"10.18653\/v1\/2021.findings-acl.84"},{"key":"1257_CR25","doi-asserted-by":"crossref","unstructured":"Wei J, Zou K. EDA: Easy data augmentation techniques for boosting performance on text classification tasks. In: Proceedings of the 14th Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, Hong Kong, China; 2019. p. 6382\u20136388.","DOI":"10.18653\/v1\/D19-1670"},{"issue":"1","key":"1257_CR26","doi-asserted-by":"publisher","first-page":"101","DOI":"10.1186\/s40537-021-00492-0","volume":"8","author":"C Shorten","year":"2021","unstructured":"Shorten C, Khoshgoftaar TM, Furht B. Text data augmentation for deep learning. J Big Data. 2021;8(1):101\u201335.","journal-title":"J Big Data"},{"key":"1257_CR27","unstructured":"Kim H-S, Kang Y, Lee J-H. STAGE: Simple text data augmentation by graph exploration. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), Torino, Italia; 2024. p. 15238\u201315256."},{"key":"1257_CR28","unstructured":"Brown T, Mann B, Ryder N, Subbiah M, Kaplan JD, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D. Language models are few-shot learners. In: Proceedings of the 34th International Conference on Neural Information Processing Systems, Vancouver, Canada; 2020. p. 1877\u20131901."},{"key":"1257_CR29","doi-asserted-by":"crossref","unstructured":"Madsen A, Meade N, Adlakha V, Reddy S. Evaluating the faithfulness of importance measures in NLP by recursively masking allegedly important tokens and retraining. In: Findings of the 17th Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, United Arab Emirates; 2022. p. 1731\u20131751.","DOI":"10.18653\/v1\/2022.findings-emnlp.125"},{"key":"1257_CR30","doi-asserted-by":"crossref","unstructured":"Chalkidis I, Fergadiotis E, Malakasiotis P, Androutsopoulos I. Large-scale multi-label text classification on EU legislation. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Florence, Italy; 2019. p. 6314\u20136322.","DOI":"10.18653\/v1\/P19-1636"},{"key":"1257_CR31","doi-asserted-by":"crossref","unstructured":"Jin Q, Dhingra B, Liu Z, Cohen W, Lu X. PubMedQA: a dataset for biomedical research question answering. In: Proceedings of the 14th Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, Hong Kong, China; 2019. p. 2567\u20132577.","DOI":"10.18653\/v1\/D19-1259"},{"key":"1257_CR32","doi-asserted-by":"crossref","unstructured":"Chalkidis I, Androutsopoulos I, Aletras N. Neural legal judgment prediction in English. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Florence, Italy; 2019. p. 4317\u20134323.","DOI":"10.18653\/v1\/P19-1424"},{"key":"1257_CR33","doi-asserted-by":"crossref","unstructured":"Jin Y, Jang E, Cui J, Chung J-W, Lee Y, Shin S. DarkBERT: A language model for the dark side of the Internet. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, Toronto, Canada; 2023. p. 7515\u20137533.","DOI":"10.18653\/v1\/2023.acl-long.415"},{"key":"1257_CR34","doi-asserted-by":"crossref","unstructured":"Karimi A, Rossi L, Prati A. AEDA: An easier data augmentation technique for text classification. In: Findings of the 16th Conference on Empirical Methods in Natural Language Processing, Punta Cana, Dominican Republic; 2021. p. 2748\u20132754.","DOI":"10.18653\/v1\/2021.findings-emnlp.234"},{"key":"1257_CR35","doi-asserted-by":"publisher","unstructured":"Qu Y, Ding Y, Liu J, Liu K, Ren R, Zhao WX, Dong D, Wu H, Wang H. Rocketqa: an optimized training approach to dense passage retrieval for open-domain question answering. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT; June 6-11, 2021. p. 5835\u20135847. https:\/\/doi.org\/10.18653\/V1\/2021.NAACL-MAIN.466. Accessed 11 Aug 2025.","DOI":"10.18653\/V1\/2021.NAACL-MAIN.466"},{"key":"1257_CR36","doi-asserted-by":"publisher","unstructured":"Ge S, Xiong C, Rosset C, Overwijk A, Han J, Bennett P. Augmenting zero-shot dense retrievers with plug-in mixture-of-memories. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, Singapore; 2023. p. 1796\u20131812. https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.111. https:\/\/aclanthology.org\/2023.emnlp-main.111\/. Accessed 11 Aug 2025.","DOI":"10.18653\/v1\/2023.emnlp-main.111"},{"key":"1257_CR37","doi-asserted-by":"publisher","unstructured":"Chen H, Dou Z, Mao K, Liu J, Zhao Z. Generalizing conversational dense retrieval via LLM-cognition data augmentation. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Bangkok, Thailand; 2024. p. 2700\u20132718. https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.149. Accessed August 11, 2025. https:\/\/aclanthology.org\/2024.acl-long.149\/. Accessed 11 Aug 2025.","DOI":"10.18653\/v1\/2024.acl-long.149"},{"key":"1257_CR38","unstructured":"Li X, Li X, Zhang H, Du Z, Jia P, Wang Y, Zhao X, Guo H, Tang R. Syneg: Llm-driven synthetic hard-negatives for dense retrieval. arXiv preprint arXiv:2412.17250 2024."},{"key":"1257_CR39","doi-asserted-by":"crossref","unstructured":"Sennrich R, Haddow B, Birch A. Improving neural machine translation models with monolingual data. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics, Berlin, Germany; August 7-12, 2016. 86\u201396.","DOI":"10.18653\/v1\/P16-1009"},{"key":"1257_CR40","unstructured":"Chen T, Kornblith S, Norouzi M, Hinton G. A simple framework for contrastive learning of visual representations. In: Proceedings of the 37th International Conference on Machine Learning, Virtual; 2020. p. 1597\u20131607."},{"key":"1257_CR41","doi-asserted-by":"crossref","unstructured":"Lee K, Chang M-W, Toutanova K. Latent retrieval for weakly supervised open domain question answering. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Florence, Italy; 2019. p. 6086\u20136096.","DOI":"10.18653\/v1\/P19-1612"},{"key":"1257_CR42","doi-asserted-by":"crossref","unstructured":"Chen D, Fisch A, Weston J, Bordes A. Reading Wikipedia to answer open-domain questions. In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics, Vancouver, Canada; 2017. p. 1870\u20131879.","DOI":"10.18653\/v1\/P17-1171"},{"key":"1257_CR43","doi-asserted-by":"crossref","unstructured":"Wang Z, Ng P, Ma X, Nallapati R, Xiang B. Multi-passage BERT: a globally normalized BERT model for open-domain question answering. In: Proceedings of the 14th Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, Hong Kong, China; 2019. p. 5878\u20135882.","DOI":"10.18653\/v1\/D19-1599"},{"key":"1257_CR44","doi-asserted-by":"crossref","unstructured":"Yang Y, Yih W-t, Meek C. WikiQA: A challenge dataset for open-domain question answering. In: M\u00e0rquez, L., Callison-Burch, C., Su, J. (eds). Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, Lisbon, Portugal; 2015. p. 2013\u20132018.","DOI":"10.18653\/v1\/D15-1237"},{"issue":"1","key":"1257_CR45","first-page":"452","volume":"7","author":"T Kwiatkowski","year":"2019","unstructured":"Kwiatkowski T, Palomaki J, Redfield O, Collins M, Parikh A, Alberti C, Epstein D, Polosukhin I, Devlin J, Lee K, Toutanova K, Jones L, Kelcey M, Chang M-W, Dai AM, Uszkoreit J, Le Q, Petrov S. Natural questions: a benchmark for question answering research. Trans Assoc Comput Linguist. 2019;7(1):452\u201366.","journal-title":"Trans Assoc Comput Linguist"},{"issue":"1","key":"1257_CR46","first-page":"1","volume":"11","author":"S Siriwardhana","year":"2023","unstructured":"Siriwardhana S, Weerasekera R, Wen E, Kaluarachchi T, Rana R, Nanayakkara S. Improving the domain adaptation of retrieval augmented generation (RAG) models for open domain question answering. Trans Assoc Comput Linguist. 2023;11(1):1\u201317.","journal-title":"Trans Assoc Comput Linguist"},{"issue":"4","key":"1257_CR47","doi-asserted-by":"publisher","first-page":"361","DOI":"10.1017\/S1351324901002789","volume":"7","author":"EM Voorhees","year":"2001","unstructured":"Voorhees EM. The TREC question answering track. Nat Lang Eng. 2001;7(4):361\u201378.","journal-title":"Nat Lang Eng"}],"container-title":["Journal of Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s40537-025-01257-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s40537-025-01257-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s40537-025-01257-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T01:20:08Z","timestamp":1757380808000},"score":1,"resource":{"primary":{"URL":"https:\/\/journalofbigdata.springeropen.com\/articles\/10.1186\/s40537-025-01257-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,13]]},"references-count":47,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["1257"],"URL":"https:\/\/doi.org\/10.1186\/s40537-025-01257-9","relation":{},"ISSN":["2196-1115"],"issn-type":[{"value":"2196-1115","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8,13]]},"assertion":[{"value":"6 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"The authors declare no Competing interests.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"199"}}