{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T02:37:04Z","timestamp":1782355024401,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":82,"publisher":"ACM","funder":[{"name":"The Alan Turing Institute","award":["SDCfP2\\100009"],"award-info":[{"award-number":["SDCfP2\\100009"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754798","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"11746-11755","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["RAIDX: A Retrieval-Augmented Generation and GRPO Reinforcement Learning Framework for Explainable Deepfake Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8315-2135","authenticated-orcid":false,"given":"Tianxiao","family":"Li","sequence":"first","affiliation":[{"name":"University of Liverpool, Liverpool, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3759-7888","authenticated-orcid":false,"given":"Zhenglin","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Liverpool, Liverpool, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3804-6753","authenticated-orcid":false,"given":"Haiquan","family":"Wen","sequence":"additional","affiliation":[{"name":"University of Liverpool, Liverpool, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0717-8517","authenticated-orcid":false,"given":"Yiwei","family":"He","sequence":"additional","affiliation":[{"name":"University of Liverpool, Liverpool, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9769-7083","authenticated-orcid":false,"given":"Shuchang","family":"Lyu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2183-5990","authenticated-orcid":false,"given":"Baoyuan","family":"Wu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8686-9513","authenticated-orcid":false,"given":"Guangliang","family":"Cheng","sequence":"additional","affiliation":[{"name":"University of Liverpool, Liverpool, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. GPT-4 technical report. Arxiv","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. GPT-4 technical report. Arxiv, 2023."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630761"},{"key":"e_1_3_2_1_3_1","unstructured":"Stability AI. Stable Diffusion v2.0. https:\/\/huggingface.co\/stabilityai\/stable-diffusion-2. Accessed: 2025-04--12."},{"key":"e_1_3_2_1_4_1","unstructured":"Stability AI. DeepFloyd IF. https:\/\/github.com\/deep-floyd\/IF. Accessed: 2025-04--12."},{"key":"e_1_3_2_1_5_1","volume-title":"Arxiv","author":"Bi Xiuli","year":"2023","unstructured":"Xiuli Bi, Bo Liu, Fan Yang, Bin Xiao, Weisheng Li, Gao Huang, and Pamela C. Cosman. Detecting Generated Images by Real Images Only. Arxiv, 2023."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00408"},{"key":"e_1_3_2_1_7_1","volume-title":"Arxiv","author":"Chang You-Ming","year":"2023","unstructured":"You-Ming Chang, Chen Yeh, Wei-Chen Chiu, and Ning Yu. AntifakePrompt: Prompt-Tuned Vision-Language Models are Fake Image Detectors. Arxiv, 2023."},{"key":"e_1_3_2_1_8_1","unstructured":"Liang Chen Lei Li Haozhe Zhao Yifan Song and Vinci. R1-V: Reinforcing Super Generalization Ability in Vision-Language Models with Less Than $3. https:\/\/github.com\/Deep-Agent\/R1-V. Accessed: 2025-02-02."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01815"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3261988"},{"key":"e_1_3_2_1_12_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv preprint arXiv:2305.06500 [cs.CV]","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv preprint arXiv:2305.06500 [cs.CV], 2023. https:\/\/arxiv.org\/abs\/2305.06500"},{"key":"e_1_3_2_1_13_1","volume-title":"Arxiv","author":"Dao Alan","year":"2025","unstructured":"Alan Dao and Dinh Bach Vu. AlphaMaze: Enhancing Large Language Models' Spatial Intelligence via GRPO. Arxiv, 2025."},{"key":"e_1_3_2_1_14_1","unstructured":"DeepSeek-AI. DeepSeek-V3 Technical Report. Arxiv 2024."},{"key":"e_1_3_2_1_15_1","volume-title":"Arxiv","author":"AI.","year":"2025","unstructured":"DeepSeek-AI. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. Arxiv, 2025."},{"key":"e_1_3_2_1_16_1","volume-title":"Arxiv","author":"Dolhansky Brian","year":"2020","unstructured":"Brian Dolhansky, Joanna Bitton, Ben Pflaum, Jikuo Lu, Russ Howes, Menglin Wang, and Cristian Canton Ferrer. The DeepFake Detection Challenge (DFDC) Dataset. Arxiv, 2020."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3180556"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2025.3618474"},{"key":"e_1_3_2_1_19_1","volume-title":"ICCV","author":"Duan Junxian","year":"2024","unstructured":"Junxian Duan, Yuang Ai, Jipeng Liu, Shenyuan Huang, Huaibo Huang, Jie Cao, and Ran He. Test-time Forgery Detection with Spatial-Frequency Prompt Learning. ICCV, 2024."},{"key":"e_1_3_2_1_20_1","first-page":"3247","volume-title":"ICML","author":"Frank Joel","year":"2020","unstructured":"Joel Frank, Thorsten Eisenhofer, Lea Sch\u00f6nherr, Asja Fischer, Dorothea Kolossa, and Thorsten Holz. Leveraging frequency analysis for deep fake image recognition. In ICML, pages 3247--3258, 2020."},{"key":"e_1_3_2_1_21_1","volume-title":"Arxiv","author":"Gao Yunfan","year":"2024","unstructured":"Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, Meng Wang, and Haofen Wang. Retrieval-Augmented Generation for Large Language Models: A Survey. Arxiv, 2024."},{"key":"e_1_3_2_1_22_1","volume-title":"Arxiv","author":"Goodfellow Ian J.","year":"2014","unstructured":"Ian J. Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. Generative Adversarial Networks. Arxiv, 2014."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00308"},{"key":"e_1_3_2_1_24_1","volume-title":"Arxiv","author":"He Kaiming","year":"2015","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep Residual Learning for Image Recognition. Arxiv, 2015."},{"key":"e_1_3_2_1_25_1","volume-title":"ICLR","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. LoRA: Low-Rank Adaptation of Large Language Models. In ICLR, 2022."},{"key":"e_1_3_2_1_26_1","volume-title":"Arxiv","author":"Huang Wenxuan","year":"2025","unstructured":"Wenxuan Huang, Bohan Jia, Zijie Zhai, Shaosheng Cao, Zheyu Ye, Fei Zhao, Zhe Xu, Yao Hu, and Shaohui Lin. Vision-R1: Incentivizing Reasoning Capability in Multimodal Large Language Models. Arxiv, 2025."},{"key":"e_1_3_2_1_27_1","volume-title":"Localization and Explanation with Large Multimodal Model. In CVPR","author":"Huang Zhenglin","year":"2025","unstructured":"Zhenglin Huang, Jinwei Hu, Xiangtai Li, Yiwei He, Xingyu Zhao, Bei Peng, Baoyuan Wu, Xiaowei Huang, and Guangliang Cheng. SIDA: Social Media Image Deepfake Detection, Localization and Explanation with Large Multimodal Model. In CVPR, 2025."},{"key":"e_1_3_2_1_28_1","volume-title":"Arxiv","author":"Huang Zhenglin","year":"2025","unstructured":"Zhenglin Huang, Tianxiao Li, Xiangtai Li, Haiquan Wen, Yiwei He, Jiangning Zhang, Hao Fei, Xi Yang, Xiaowei Huang, Bei Peng, and Guangliang Cheng. So-Fake: Benchmarking and Explaining Social Media Image Forgery Detection. Arxiv, 2025."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897820"},{"key":"e_1_3_2_1_30_1","volume-title":"Arxiv","author":"Kang Hengrui","year":"2025","unstructured":"Hengrui Kang, Siwei Wen, Zichen Wen, Junyan Ye, Weijia Li, Peilin Feng, Baichuan Zhou, Bin Wang, Dahua Lin, Linfeng Zhang, and Conghui He. LEGION: Learning to Ground and Explain for Synthetic Image Detection. Arxiv, 2025."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2970919"},{"key":"e_1_3_2_1_32_1","volume-title":"Arxiv","author":"Kundu Rohit","year":"2024","unstructured":"Rohit Kundu, Hao Xiong, Vishal Mohanty, Athula Balachandran, and Amit K Roy-Chowdhury. Towards a Universal Synthetic Video Detector: From Face or Background Manipulations to Fully AI-Generated Content. Arxiv, 2024."},{"key":"e_1_3_2_1_33_1","volume-title":"Woo. Quality-Agnostic Deepfake Detection with Intra-model Collaborative Learning. Arxiv","author":"Binh","year":"2023","unstructured":"Binh M. Le and Simon S. Woo. Quality-Agnostic Deepfake Detection with Intra-model Collaborative Learning. Arxiv, 2023."},{"key":"e_1_3_2_1_34_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2021","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. Arxiv (2021)."},{"key":"e_1_3_2_1_35_1","unstructured":"Patrick S. H. Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goya Mike Lewis Wen{-}tau Yih Sebastian Riedel and Douwe Kiela. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In NeurIPS."},{"key":"e_1_3_2_1_36_1","volume-title":"Playground v2.5: Three Insights towards Enhancing Aesthetic Quality in Text-to-Image Generation. Arxiv","author":"Li Daiqing","year":"2024","unstructured":"Daiqing Li, Aleks Kamko, Ehsan Akhgari, Ali Sabet, Linmiao Xu, and Suhail Doshi. 2024. Playground v2.5: Three Insights towards Enhancing Aesthetic Quality in Text-to-Image Generation. Arxiv (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML."},{"key":"e_1_3_2_1_38_1","volume-title":"A Large-scale Interpretable Multi-modality Benchmark for Facial Image Forgery Localization. Arxiv","author":"Lian Jingchun","year":"2024","unstructured":"Jingchun Lian, Lingyu Liu, Yaxiong Wang, Yujiao Wu, Li Zhu, and Zhedong Zheng. 2024. A Large-scale Interpretable Multi-modality Benchmark for Facial Image Forgery Localization. Arxiv (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Microsoft COCO: Common Objects in Context. Arxiv","author":"Lin Tsung-Yi","year":"2015","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, Lubomir Bourdev, Ross Girshick, James Hays, Pietro Perona, Deva Ramanan, C. Lawrence Zitnick, and Piotr Doll\u00e1r. 2015. Microsoft COCO: Common Objects in Context. Arxiv (2015)."},{"key":"e_1_3_2_1_40_1","volume-title":"Visual instruction tuning. NeurIPS","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. NeurIPS (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"Forgery-aware Adaptive Transformer for Generalizable Synthetic Image Detection. Arxiv","author":"Liu Huan","year":"2023","unstructured":"Huan Liu, Zichang Tan, Chuangchuang Tan, Yunchao Wei, Yao Zhao, and Jingdong Wang. 2023. Forgery-aware Adaptive Transformer for Generalizable Synthetic Image Detection. Arxiv (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"ForgeryGPT: Multimodal Large Language Model For Explainable Image Forgery Detection and Localization. Arxiv","author":"Liu Jiawei","year":"2025","unstructured":"Jiawei Liu, Fanrui Zhang, Jiaying Zhu, Esther Sun, Qiang Zhang, and Zheng-Jun Zha. 2025. ForgeryGPT: Multimodal Large Language Model For Explainable Image Forgery Detection and Localization. Arxiv (2025)."},{"key":"e_1_3_2_1_43_1","volume-title":"PSCC-Net: Progressive Spatio-Channel Correlation Network for Image Manipulation Detection and Localization. T-CSVT","author":"Liu Xiaohong","year":"2022","unstructured":"Xiaohong Liu, Yaojie Liu, Jun Chen, and Xiaoming Liu. 2022. PSCC-Net: Progressive Spatio-Channel Correlation Network for Image Manipulation Detection and Localization. T-CSVT (2022)."},{"key":"e_1_3_2_1_44_1","volume-title":"Torr","author":"Liu Zhengzhe","year":"2020","unstructured":"Zhengzhe Liu, Xiaojuan Qi, and Philip H. S. Torr. 2020. Global Texture Enhancement for Fake Face Detection in the Wild. In CVPR."},{"key":"e_1_3_2_1_45_1","volume-title":"Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension. Arxiv","author":"Luo Yongdong","year":"2024","unstructured":"Yongdong Luo, Xiawu Zheng, Xiao Yang, Guilin Li, Haojia Lin, Jinfa Huang, Jiayi Ji, Fei Chao, Jiebo Luo, and Rongrong Ji. 2024. Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension. Arxiv (2024)."},{"key":"e_1_3_2_1_46_1","unstructured":"Inc. Midjourney. 2023. Midjourney v5. https:\/\/www.midjourney.com\/. Accessed: 2025-04--12."},{"key":"e_1_3_2_1_47_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. Arxiv","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. Arxiv (2022)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Utkarsh Ojha Yuheng Li and Yong Jae Lee. 2023. Towards Universal Fake Image Detectors that Generalize Across Generative Models. In CVPR.","DOI":"10.1109\/CVPR52729.2023.02345"},{"key":"e_1_3_2_1_49_1","unstructured":"OpenAI. 2023. DALL\u00b7E 3. https:\/\/openai.com\/dall-e. Accessed: 2025-04--12."},{"key":"e_1_3_2_1_50_1","volume-title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. Arxiv","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. Arxiv (2023)."},{"key":"e_1_3_2_1_51_1","volume-title":"RoRA-VLM: Robust Retrieval-Augmented Vision Language Models. Arxiv","author":"Qi Jingyuan","year":"2024","unstructured":"Jingyuan Qi, Zhiyang Xu, Rulin Shao, Yang Chen, Di Jin, Yu Cheng, Qifan Wang, and Lifu Huang. 2024. RoRA-VLM: Robust Retrieval-Augmented Vision Language Models. Arxiv (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_53_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans and Ilya Sutskever. 2018. Improving language understanding by generative pre-training."},{"key":"e_1_3_2_1_54_1","volume-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents. Arxiv","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. Arxiv (2022)."},{"key":"e_1_3_2_1_55_1","volume-title":"Towards the Detection of Diffusion Model Deepfakes. Arxiv","author":"Ricker Jonas","year":"2024","unstructured":"Jonas Ricker, Simon Damm, Thorsten Holz, and Asja Fischer. 2024. Towards the Detection of Diffusion Model Deepfakes. Arxiv (2024)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_57_1","volume-title":"Proximal policy optimization algorithms. Arxiv","author":"Schulman John","year":"2017","unstructured":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. 2017. Proximal policy optimization algorithms. Arxiv (2017)."},{"key":"e_1_3_2_1_58_1","volume-title":"DE-FAKE: Detection and Attribution of Fake Images Generated by Text-to-Image Generation Models. Arxiv","author":"Sha Zeyang","year":"2023","unstructured":"Zeyang Sha, Zheng Li, Ning Yu, and Yang Zhang. 2023. DE-FAKE: Detection and Attribution of Fake Images Generated by Text-to-Image Generation Models. Arxiv (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. Arxiv","author":"Shao Zhihong","year":"2024","unstructured":"Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Mingchuan Zhang, Y. K. Li, Y. Wu, and Daya Guo. 2024. DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. Arxiv (2024)."},{"key":"e_1_3_2_1_60_1","unstructured":"Haozhan Shen Zilun Zhang Kangjia Zhao Qianqian Zhang Ruochen Xu and Tiancheng Zhao. 2025. VLM-R1: A stable and generalizable R1-style Large Vision-Language Model. https:\/\/github.com\/om-ai-lab\/VLM-R1. Accessed: 2025-02--15."},{"key":"e_1_3_2_1_61_1","volume-title":"RACE: Retrieval-Augmented Commit Message Generation. Arxiv","author":"Shi Ensheng","year":"2022","unstructured":"Ensheng Shi, Yanlin Wang, Wei Tao, Lun Du, Hongyu Zhang, Shi Han, Dongmei Zhang, and Hongbin Sun. 2022. RACE: Retrieval-Augmented Commit Message Generation. Arxiv (2022)."},{"key":"e_1_3_2_1_62_1","volume-title":"A Survey of Multimodal-Guided Image Editing with Text-to-Image Diffusion Models. Arxiv","author":"Shuai Xincheng","year":"2024","unstructured":"Xincheng Shuai, Henghui Ding, Xingjun Ma, Rongcheng Tu, Yu-Gang Jiang, and Dacheng Tao. 2024. A Survey of Multimodal-Guided Image Editing with Text-to-Image Diffusion Models. Arxiv (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"ForgerySleuth: Empowering Multimodal Large Language Models for Image Manipulation Detection. Arxiv","author":"Sun Zhihao","year":"2024","unstructured":"Zhihao Sun, Haoran Jiang, Haoran Chen, Yixin Cao, Xipeng Qiu, Zuxuan Wu, and Yu-Gang Jiang. 2024. ForgerySleuth: Empowering Multimodal Large Language Models for Image Manipulation Detection. Arxiv (2024)."},{"key":"e_1_3_2_1_64_1","unstructured":"Chuangchuang Tan Yao Zhao Shikui Wei Guanghua Gu Ping Liu and Yunchao Wei. 2024. Rethinking the up-sampling operations in CNN-based generative network for generalizable deepfake detection. In CVPR."},{"key":"e_1_3_2_1_65_1","unstructured":"Chuangchuang Tan Yao Zhao Shikui Wei Guanghua Gu and Yunchao Wei. 2023. Learning on gradients: Generalized artifacts representation for GAN-generated images detection. In CVPR."},{"key":"e_1_3_2_1_66_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. Arxiv","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. Arxiv (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"Representative Forgery Mining for Fake Face Detection. Arxiv","author":"Wang Chengrui","year":"2021","unstructured":"Chengrui Wang and Weihong Deng. 2021. Representative Forgery Mining for Fake Face Detection. Arxiv (2021)."},{"key":"e_1_3_2_1_68_1","volume-title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. Arxiv","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. 2024. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. Arxiv (2024)."},{"key":"e_1_3_2_1_69_1","volume-title":"Efros","author":"Wang Sheng-Yu","year":"2020","unstructured":"Sheng-Yu Wang, Oliver Wang, Richard Zhang, Andrew Owens, and Alexei A. Efros. 2020. CNN-generated images are surprisingly easy to spot... for now. In CVPR."},{"key":"e_1_3_2_1_70_1","volume-title":"Efros","author":"Wang Sheng-Yu","year":"2020","unstructured":"Sheng-Yu Wang, Oliver Wang, Richard Zhang, Andrew Owens, and Alexei A. Efros. 2020. CNN-generated images are surprisingly easy to spot... for now. Arxiv (2020). https:\/\/arxiv.org\/abs\/1912.11035"},{"key":"e_1_3_2_1_71_1","volume-title":"CogVLM: Visual Expert for Pretrained Language Models. Arxiv","author":"Wang Weihan","year":"2024","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, Jiazheng Xu, Bin Xu, Juanzi Li, Yuxiao Dong, Ming Ding, and Jie Tang. 2024. CogVLM: Visual Expert for Pretrained Language Models. Arxiv (2024)."},{"key":"e_1_3_2_1_72_1","volume-title":"DIRE for Diffusion-Generated Image Detection. Arxiv","author":"Wang Zhendong","year":"2023","unstructured":"Zhendong Wang, Jianmin Bao, Wengang Zhou, Weilun Wang, Hezhen Hu, Hong Chen, and Houqiang Li. 2023. DIRE for Diffusion-Generated Image Detection. Arxiv (2023)."},{"key":"e_1_3_2_1_73_1","volume-title":"DiffusionDB: A Large-scale Prompt Gallery Dataset for Text-to-Image Generative Models. Arxiv","author":"Wang Zijie J.","year":"2023","unstructured":"Zijie J. Wang, Evan Montoya, David Munechika, Haoyang Yang, Benjamin Hoover, and Duen Horng Chau. 2023. DiffusionDB: A Large-scale Prompt Gallery Dataset for Text-to-Image Generative Models. Arxiv (2023)."},{"key":"e_1_3_2_1_74_1","volume-title":"BusterX: MLLM-Powered AI-Generated Video Forgery Detection and Explanation. Arxiv","author":"Wen Haiquan","year":"2025","unstructured":"Haiquan Wen, Yiwei He, Zhenglin Huang, Tianxiao Li, Zihan Yu, Xingru Huang, Lu Qi, Baoyuan Wu, Xiangtai Li, and Guangliang Cheng. 2025. BusterX: MLLM-Powered AI-Generated Video Forgery Detection and Explanation. Arxiv (2025)."},{"key":"e_1_3_2_1_75_1","volume-title":"BusterX: Towards Unified Cross-Modal AI-Generated Content Detection and Explanation with MLLM. Arxiv","author":"Wen Haiquan","year":"2025","unstructured":"Haiquan Wen, Tianxiao Li, Zhenglin Huang, Yiwei He, and Guangliang Cheng. 2025. BusterX: Towards Unified Cross-Modal AI-Generated Content Detection and Explanation with MLLM. Arxiv (2025)."},{"key":"e_1_3_2_1_76_1","volume-title":"Generalizable Synthetic Image Detection via Language-guided Contrastive Learning. Arxiv","author":"Wu Haiwei","year":"2023","unstructured":"Haiwei Wu, Jiantao Zhou, and Shile Zhang. 2023. Generalizable Synthetic Image Detection via Language-guided Contrastive Learning. Arxiv (2023)."},{"key":"e_1_3_2_1_77_1","unstructured":"Jin Xu Zhifang Guo Jinzheng He Hangrui Hu Ting He Shuai Bai Keqin Chen Jialin Wang Yang Fan Kai Dang Bin Zhang Xiong Wang Yunfei Chu and Junyang Lin. 2025. Qwen2.5-Omni Technical Report. Arxiv (2025)."},{"key":"e_1_3_2_1_78_1","unstructured":"Zhiyuan Yan Taiping Yao Shen Chen Yandan Zhao Xinghe Fu Junwei Zhu Donghao Luo Chengjie Wang Shouhong Ding Yunsheng Wu and Li Yuan. 2024. DF40: Toward Next-Generation Deepfake Detection. In NeurIPS."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_80_1","volume-title":"Xing","author":"Zhan Fangneng","year":"2023","unstructured":"Fangneng Zhan, Yingchen Yu, Rongliang Wu, Jiahui Zhang, Shijian Lu, Lingjie Liu, Adam Kortylewski, Christian Theobalt, and Eric P. Xing. 2023. Multimodal Image Synthesis and Editing: The Generative AI Era. PAMI (2023)."},{"key":"e_1_3_2_1_81_1","volume-title":"RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation. Arxiv","author":"Zhang Fengji","year":"2023","unstructured":"Fengji Zhang, Bei Chen, Yue Zhang, Jacky Keung, Jin Liu, Daoguang Zan, Yi Mao, Jian-Guang Lou, and Weizhu Chen. 2023. RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation. Arxiv (2023)."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"crossref","unstructured":"Yinglin Zheng Jianmin Bao Dong Chen Ming Zeng and Fang Wen. 2021. Exploring temporal coherence for more general video face forgery detection. In CVPR.","DOI":"10.1109\/ICCV48922.2021.01477"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754798","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:56:39Z","timestamp":1765342599000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754798"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":82,"alternative-id":["10.1145\/3746027.3754798","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754798","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}