{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:35Z","timestamp":1765339775619,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFC3301702, 2022YFC3301703"],"award-info":[{"award-number":["2022YFC3301702, 2022YFC3301703"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476093"],"award-info":[{"award-number":["62476093"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755803","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"757-766","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["From Pixels to Semantics: A Novel MLLM-Driven Approach for Explainable Tampered Text Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-3023-4186","authenticated-orcid":false,"given":"Guitao","family":"Xu","sequence":"first","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2928-6746","authenticated-orcid":false,"given":"Ziqi","family":"Yi","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1857-5473","authenticated-orcid":false,"given":"Peirong","family":"Zhang","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9483-2424","authenticated-orcid":false,"given":"Jiahuan","family":"Cao","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3863-047X","authenticated-orcid":false,"given":"Shihang","family":"Wu","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5456-0957","authenticated-orcid":false,"given":"Lianwen","family":"Jin","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, China and SCUT-Zhuhai Institute of Modern Industrial Innovation, Zhuhai, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Anthropic. 2024. Introducing Claude 3.5 Sonnet. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet."},{"key":"e_1_3_2_1_2_1","unstructured":"Anthropic. 2025. Introducing Claude 3.7 Sonnet. https:\/\/www.anthropic.com\/news\/claude-3-7-sonnet."},{"key":"e_1_3_2_1_3_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2.5-VL Technical Report. arXiv:2502.13923"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00010"},{"key":"e_1_3_2_1_5_1","first-page":"63062","article-title":"Diffute: Universal text editing diffusion model","volume":"36","author":"Chen Haoxing","year":"2023","unstructured":"Haoxing Chen, Zhuoer Xu, Zhangxuan Gu, Yaohui Li, Changhua Meng, Huijia Zhu, Weiqiang Wang, et al., 2023. 
Diffute: Universal text editing diffusion model. Advances in Neural Information Processing Systems, Vol. 36 (2023), 63062-63074.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"e_1_3_2_1_7_1","unstructured":"Yize Chen Zhiyuan Yan Siwei Lyu and Baoyuan Wu. 2024c. X2-DFD: A framework for eXplainable and eXtendable Deepfake Detection. arXiv:2410.06126"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the European Conference on Computer Vision. 200-217","author":"Chen Zhongxi","year":"2024","unstructured":"Zhongxi Chen, Shen Chen, Taiping Yao, Ke Sun, Shouhong Ding, Xianming Lin, Liujuan Cao, and Rongrong Ji. 2024a. Enhancing Tampered Text Detection Through Frequency Feature Fusion and Decomposition. In Proceedings of the European Conference on Computer Vision. 200-217."},{"key":"e_1_3_2_1_9_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye et al. 2025. Expanding Performance Boundaries of Open-Source Multimodal Models with Model Data and Test-Time Scaling. arXiv:2412.05271"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024b. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198."},{"key":"e_1_3_2_1_11_1","first-page":"17864","article-title":"Per-pixel classification is not all you need for semantic segmentation","volume":"34","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Alex Schwing, and Alexander Kirillov. 2021. 
Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems, Vol. 34 (2021), 17864-17875.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3180556"},{"key":"e_1_3_2_1_13_1","unstructured":"Yueying Gao Dongliang Chang Bingyao Yu Haotian Qin Lei Chen Kongming Liang and Zhanyu Ma. 2025. FakeReasoning: Towards Generalizable Forgery Detection and Reasoning. arXiv:2503.21210"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111656"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. LoRA: Low-Rank Adaptation of Large Language Models.. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_16_1","volume-title":"SIDA: Social Media Image Deepfake Detection, Localization and Explanation with Large Multimodal Model. arXiv:2412.04292","author":"Huang Zhenglin","year":"2024","unstructured":"Zhenglin Huang, Jinwei Hu, Xiangtai Li, Yiwei He, Xingyu Zhao, Bei Peng, Baoyuan Wu, Xiaowei Huang, and Guangliang Cheng. 2024a. SIDA: Social Media Image Deepfake Detection, Localization and Explanation with Large Multimodal Model. arXiv:2412.04292"},{"key":"e_1_3_2_1_17_1","volume-title":"FFAA: Multimodal Large Language Model based Explainable Open-World Face Forgery Analysis Assistant. arXiv:2408.10072","author":"Huang Zhengchao","year":"2024","unstructured":"Zhengchao Huang, Bin Xia, Zicheng Lin, Zhun Mou, Wenming Yang, and Jiaya Jia. 2024b. FFAA: Multimodal Large Language Model based Explainable Open-World Face Forgery Analysis Assistant. arXiv:2408.10072"},{"key":"e_1_3_2_1_18_1","unstructured":"INTSIG. 2025. 
TextIn-AI. https:\/\/www.textin.com."},{"key":"e_1_3_2_1_19_1","volume-title":"LEGION: Learning to Ground and Explain for Synthetic Image Detection. arXiv:2503.15264","author":"Kang Hengrui","year":"2025","unstructured":"Hengrui Kang, Siwei Wen, Zichen Wen, Junyan Ye, Weijia Li, Peilin Feng, Baichuan Zhou, Bin Wang, Dahua Lin, Linfeng Zhang, and Conghui He. 2025. LEGION: Learning to Ground and Explain for Synthetic Image Detection. arXiv:2503.15264"},{"key":"e_1_3_2_1_20_1","volume-title":"Segment Anything. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 4015-4026","author":"Kirillov Alexander","year":"2023","unstructured":"Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Dollar, and Ross Girshick. 2023. Segment Anything. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 4015-4026."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01617-5"},{"key":"e_1_3_2_1_22_1","unstructured":"Bo Li Yuanhan Zhang Dong Guo Renrui Zhang Feng Li Hao Zhang Kaichen Zhang Peiyuan Zhang Yanwei Li Ziwei Liu et al. 2024b. LLaVA-OneVision: Easy Visual Task Transfer. arXiv:2408.03326"},{"key":"e_1_3_2_1_23_1","volume-title":"Shiqi Wang, Anderson Rocha, and Weisi Lin.","author":"Li Yixuan","year":"2024","unstructured":"Yixuan Li, Xuelin Liu, Xiaoyang Wang, Bu Sung Lee, Shiqi Wang, Anderson Rocha, and Weisi Lin. 2024a. FakeBench: Probing Explainable Fake Image Detection via Large Multimodal Models. arXiv:2404.13306"},{"key":"e_1_3_2_1_24_1","unstructured":"Jingchun Lian Lingyu Liu Yaxiong Wang Yujiao Wu Li Zhu and Zhedong Zheng. 2024. A Large-scale Interpretable Multi-modality Benchmark for Facial Image Forgery Localization. arXiv:2412.19685"},{"key":"e_1_3_2_1_25_1","unstructured":"Jiawei Liu Fanrui Zhang Jiaying Zhu Esther Sun Qiang Zhang and Zheng-Jun Zha. 2024. 
ForgeryGPT: Multimodal Large Language Model For Explainable Image Forgery Detection and Localization. arXiv:2410.10238"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3189545"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110828"},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman et al. 2023. GPT-4 Technical Report. arXiv:2303.08774"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28245"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00575"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Qu Chenfan","year":"2025","unstructured":"Chenfan Qu, Chongyu Liu, Yuliang Liu, Xinhong Chen, Dezhi Peng, Fengjun Guo, and Lianwen Jin. 2025a. Ke Sun and Shen Chen and Taiping Yao and Haozhe Yang and Xiaoshuai Sun and Shouhong Ding and Rongrong Ji. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_34_1","unstructured":"Chenfan Qu Jian Liu Haoxing Chen Baihan Yu Jingjing Liu Weiqiang Wang and Lianwen Jin. 2024. TextSleuth: Towards Explainable Tampered Text Detection. arXiv:2412.14816"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence.","author":"Qu Chenfan","year":"2025","unstructured":"Chenfan Qu, Yiwu Zhong, Fengjun Guo, and Lianwen Jin. 2025b. Generalized Tampered Scene Text Detection in the era of Generative AI. 
In Proceedings of the AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3686158"},{"key":"e_1_3_2_1_37_1","unstructured":"Zhihao Sun Haoran Jiang Haoran Chen Yixin Cao Xipeng Qiu Zuxuan Wu and Yu-Gang Jiang. 2024. ForgerySleuth: Empowering Multimodal Large Language Models for Image Manipulation Detection. arXiv:2411.19466"},{"key":"e_1_3_2_1_38_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. arXiv:2312.11805","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, Johan Schalkwyk, Andrew M. Dai, Anja Hauth, Katie Millican, et al., 2024. Gemini: A Family of Highly Capable Multimodal Models. arXiv:2312.11805"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2983686"},{"key":"e_1_3_2_1_40_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv:2409.12191"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_13"},{"key":"e_1_3_2_1_42_1","first-page":"29","article-title":"Tampered text detection via RGB and frequency relationship modeling","volume":"8","author":"Yuxin WANG, Boqiang ZHANG, Hongtao","year":"2022","unstructured":"Yuxin WANG, Boqiang ZHANG, Hongtao XIE, and Yongdong ZHANG. 2022. Tampered text detection via RGB and frequency relationship modeling. Chinese Journal of Network and Information Security, Vol. 8, 3 (2022), 29.","journal-title":"Chinese Journal of Network and Information Security"},{"key":"e_1_3_2_1_43_1","unstructured":"Siwei Wen Junyan Ye Peilin Feng Hengrui Kang Zichen Wen Yize Chen Jiang Wu Wenjun Wu Conghui He and Weijia Li. 2025. 
Spot the Fake: Large Multimodal Model-Based Synthetic Image Detection with Artifact Explanation. arXiv:2503.14905"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350929"},{"key":"e_1_3_2_1_45_1","unstructured":"Zhiyu Wu Xiaokang Chen Zizheng Pan Xingchao Liu Wen Liu Damai Dai Huazuo Gao Yiyang Ma Chengyue Wu Bingxuan Wang et al. 2024. DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding. arXiv:2412.10302"},{"key":"e_1_3_2_1_46_1","first-page":"12077","article-title":"SegFormer: Simple and efficient design for semantic segmentation with transformers","volume":"34","author":"Xie Enze","year":"2021","unstructured":"Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M Alvarez, and Ping Luo. 2021. SegFormer: Simple and efficient design for semantic segmentation with transformers. Advances in Neural Information Processing Systems, Vol. 34 (2021), 12077-12090.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Xu Zhipei","year":"2025","unstructured":"Zhipei Xu, Xuanyu Zhang, Runyi Li, Zecheng Tang, Qing Huang, and Jian Zhang. 2025. FakeShield: Explainable Image Forgery Detection and Localization via Multi-modal Large Language Models. 
In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095070"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755803","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:05:43Z","timestamp":1765339543000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755803"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":48,"alternative-id":["10.1145\/3746027.3755803","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755803","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}