{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,8]],"date-time":"2026-03-08T01:44:47Z","timestamp":1772934287713,"version":"3.50.1"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T00:00:00Z","timestamp":1765152000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T00:00:00Z","timestamp":1765152000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,8]]},"DOI":"10.1109\/bigdata66926.2025.11402039","type":"proceedings-article","created":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T20:57:57Z","timestamp":1772830677000},"page":"3656-3664","source":"Crossref","is-referenced-by-count":0,"title":["Analysis of Semantic Benchmark Data Contamination Attack for LLM-Driven Fake News Detection"],"prefix":"10.1109","author":[{"given":"Cheng","family":"Xu","sequence":"first","affiliation":[{"name":"School of Computer Science, University College Dublin,Dublin,Ireland"}]},{"given":"M-Tahar","family":"Kechadi","sequence":"additional","affiliation":[{"name":"School of Computer Science, University College Dublin,Dublin,Ireland"}]}],"member":"263","reference":[{"key":"ref1","year":"2024","journal-title":"Gpt-4 technical report"},{"key":"ref2","author":"Touvron","year":"2023","journal-title":"Llama: Open and efficient foundation language models"},{"key":"ref3","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref4","article-title":"Benchmark data contamination of large language models: A survey","author":"Xu","year":"2024"},{"key":"ref5","article-title":"The emperor\u2019s new clothes in benchmarking? 
a rigorous examination of mitigation strategies for LLM benchmark data contamination","volume-title":"ICLR 2025 Workshop on Navigating and Addressing Data Problems for Foundation Models","author":"Sun","year":"2025"},{"key":"ref6","doi-asserted-by":"crossref","first-page":"8424","DOI":"10.18653\/v1\/2022.acl-long.577","article-title":"Deduplicating training data makes language models better","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Lee","year":"2022"},{"key":"ref7","doi-asserted-by":"crossref","first-page":"10776","DOI":"10.18653\/v1\/2023.findings-emnlp.722","article-title":"NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2023","author":"Sainz","year":"2023"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/tai.2025.3569516"},{"key":"ref9","author":"Zhou","year":"2023","journal-title":"Don\u2019t make your LLM an evaluation benchmark cheater"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"14116","DOI":"10.18653\/v1\/2024.acl-long.761","article-title":"Quantifying contamination in evaluating code generation capabilities of language models","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Riddell","year":"2024"},{"key":"ref11","first-page":"507","article-title":"Advancing Arabic sentiment analysis: ArSen benchmark and the improved fuzzy deep hybrid network","volume-title":"Proceedings of the 28th Conference on Computational Natural Language Learning","author":"Fang"},{"key":"ref12","article-title":"Decolonizing african NLP: A survey on power dynamics and data colonialism in tech development","volume-title":"5th Workshop on African Natural Language 
Processing","author":"Yan","year":"2024"},{"key":"ref13","first-page":"23013","article-title":"DCR: Quantifying data contamination in LLMs evaluation","volume-title":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","author":"Xu"},{"key":"ref14","first-page":"14748","article-title":"SSA: Semantic contamination of LLM-driven fake news detection","volume-title":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","author":"Xu"},{"key":"ref15","article-title":"Time travel in LLMs: Tracing data contamination in large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Golchin","year":"2024"},{"key":"ref16","doi-asserted-by":"crossref","first-page":"8808","DOI":"10.18653\/v1\/2025.acl-long.431","article-title":"TripleFact: Defending data contamination in the evaluation of LLM-driven fake news detection","volume-title":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Xu","year":"2025"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3395046"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3137597.3137600"},{"key":"ref19","volume-title":"Fake news detectors are biased against texts generated by large language models","author":"Su","year":"2023"},{"key":"ref20","article-title":"AROT-COV23: A dataset of 500k original arabic tweets on COVID-19","volume-title":"4th Workshop on African Natural Language Processing","author":"Xu","year":"2023"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"16078","DOI":"10.18653\/v1\/2024.findings-acl.951","article-title":"Unveiling the spectrum of data contamination in language model: A survey from detection to remediation","author":"Deng","year":"2024","journal-title":"Findings of the Association for Computational Linguistics ACL 
2024"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/access.2024.3418340"},{"key":"ref23","article-title":"Introducing the next generation of claude","year":"2024"},{"key":"ref24","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2024"},{"key":"ref25","article-title":"Mistral 7b","author":"Jiang","year":"2023"},{"key":"ref26","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref27","first-page":"67","article-title":"Leak, cheat, repeat: Data contamination and evaluation malpractices in closed-source LLMs","volume-title":"Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Balloccu","year":"2024"},{"key":"ref28","first-page":"28","article-title":"Preventing generation of verbatim memorization in language models gives a false sense of privacy","volume-title":"Proceedings of the 16th International Natural Language Generation Conference","author":"Ippolito"},{"key":"ref29","first-page":"17","article-title":"Deduplicating training data mitigates privacy risks in language models","volume-title":"Proceedings of the 39th International Conference on Machine Learning, ser. 
Proceedings of Machine Learning Research","volume":"162","author":"Kandpal"},{"key":"ref30","article-title":"Detecting pretraining data from large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Shi","year":"2024"},{"key":"ref31","author":"Duarte","year":"2024","journal-title":"De-cop: Detecting copyrighted content in language models training data"},{"key":"ref32","author":"Li","year":"2023","journal-title":"Estimating contamination via perplexity: Quantifying memorisation in language model evaluation"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1162\/tacl.a.20"},{"key":"ref34","article-title":"Proving test set contamination for black-box language models","volume-title":"The Twelfth International Conference on Learning Representations, 2024","author":"Oren"},{"key":"ref35","doi-asserted-by":"crossref","first-page":"157","DOI":"10.18653\/v1\/2022.acl-short.18","article-title":"Data contamination: From memorization to exploitation","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)","author":"Magar","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3628797.3628971"},{"key":"ref37","article-title":"Real-time fake news from adversarial feedback","author":"Chen","year":"2024"},{"key":"ref38","article-title":"Dyval: Graph-informed dynamic evaluation of large language models","volume-title":"The Twelfth International Conference on Learning Representations, 2024","author":"Zhu"},{"key":"ref39","author":"Ying","year":"2024","journal-title":"Have seen me before? 
automating dataset updates towards reliable and timely evaluation"},{"key":"ref40","author":"Yang","year":"2023","journal-title":"Rethinking benchmark and contamination for language models with rephrased samples"},{"key":"ref41","first-page":"5075","article-title":"Stop uploading test data in plain text: Practical strategies for mitigating data contamination by evaluation benchmarks","volume-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing","author":"Jacovi"},{"key":"ref42","article-title":"Platypus: Quick, cheap, and powerful refinement of LLMs","volume-title":"NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following","author":"Lee","year":"2023"},{"key":"ref43","first-page":"17864","article-title":"Data contamination can cross language barriers","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","author":"Yao"},{"issue":"16","key":"ref44","first-page":"18471","article-title":"Task contamination: Language models may not be few-shot anymore","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"38","author":"Li","year":"2024"},{"key":"ref45","first-page":"508","article-title":"On fake news detection with LLM enhanced semantics mining","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","author":"Ma"},{"key":"ref46","first-page":"2529","article-title":"Instruction pre-training: Language models are supervised multitask learners","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","author":"Cheng"},{"key":"ref47","article-title":"Qwen2.5: A party of foundation models","author":"Team","year":"2024"},{"key":"ref48","first-page":"1","article-title":"Gdelt: Global data on events, location, and tone, 1979\u20132012","volume-title":"ISA annual 
convention","volume":"2","author":"Leetaru","year":"2013"},{"key":"ref49","article-title":"The refinedweb dataset for falcon LLM: outperforming curated corpora with web data only","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems, ser. NIPS \u201923. Red Hook","author":"Penedo","year":"2023"}],"event":{"name":"2025 IEEE International Conference on Big Data (BigData)","location":"Macau, China","start":{"date-parts":[[2025,12,8]]},"end":{"date-parts":[[2025,12,11]]}},"container-title":["2025 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11400704\/11400712\/11402039.pdf?arnumber=11402039","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T07:04:15Z","timestamp":1772867055000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11402039\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,8]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/bigdata66926.2025.11402039","relation":{},"subject":[],"published":{"date-parts":[[2025,12,8]]}}}