{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:42:21Z","timestamp":1777657341070,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62271312, 62401365, 62225112, 62132006, U24A20220"],"award-info":[{"award-number":["62271312, 62401365, 62225112, 62132006, U24A20220"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["BX20250411, 2025M773473"],"award-info":[{"award-number":["BX20250411, 2025M773473"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758204","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:39:06Z","timestamp":1761377946000},"page":"12666-12673","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["DFBench: Benchmarking Deepfake Image Detection Capability of Large Multimodal Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-9890-2698","authenticated-orcid":false,"given":"Jiarui","family":"Wang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6519-4067","authenticated-orcid":false,"given":"Huiyu","family":"Duan","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2497-2451","authenticated-orcid":false,"given":"Juntong","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2623-4756","authenticated-orcid":false,"given":"Ziheng","family":"Jia","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1171-2929","authenticated-orcid":false,"given":"Woo Yi","family":"Yang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9145-3829","authenticated-orcid":false,"given":"Xiaorong","family":"Zhu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6121-1272","authenticated-orcid":false,"given":"Yu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8783-4942","authenticated-orcid":false,"given":"Jiaying","family":"Qian","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3968-594X","authenticated-orcid":false,"given":"Yuke","family":"Xing","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8165-9322","authenticated-orcid":false,"given":"Guangtao","family":"Zhai","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5693-0416","authenticated-orcid":false,"given":"Xiongkuo","family":"Min","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 1-17","author":"Ahn Donghoon","year":"2024","unstructured":"Donghoon Ahn, Hyoungwon Cho, Jaewon Min, Wooseok Jang, Jungwoo Kim, SeonHwa Kim, Hyun Hee Park, Kyong Hwan Jin, and Seungryong Kim. 2024. Self-rectifying diffusion sampling with perturbed-attention guidance. In Proceedings of the European Conference on Computer Vision (ECCV). 1-17."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Vladimir Arkhipkin Viacheslav Vasilev Andrei Filatov Igor Pavlov Julia Agafonova Nikolai Gerasimenko Anna Averchenkova Evelina Mironova Anton Bukashkin Konstantin Kulikov et al. 2024. Kandinsky 3: Text-to-image synthesis for multifunctional generative framework. arXiv preprint arXiv:2410.21061 (2024).","DOI":"10.18653\/v1\/2024.emnlp-demo.48"},{"key":"e_1_3_2_1_3_1","volume-title":"arXiv preprint arXiv:2502.13923","author":"Bai Shuai","year":"2025","unstructured":"Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, Humen Zhong, Yuanzhi Zhu, Mingkun Yang, Zhaohai Li, Jianqiang Wan, Pengfei Wang, Wei Ding, Zheren Fu, Yiheng Xu, Jiabo Ye, Xi Zhang, Tianbao Xie, Zesen Cheng, Hang Zhang, Zhibo Yang, Haiyang Xu, and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_4_1","volume-title":"Cosman","author":"Bi Xiuli","year":"2023","unstructured":"Xiuli Bi, Bo Liu, Fan Yang, Bin Xiao, Weisheng Li, Gao Huang, and Pamela C. Cosman. 2023. Detecting Generated Images by Real Images Only. Arxiv (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"CIFAKE: Image Classification and Explainable Identification of AI-Generated Synthetic Images. arXiv preprint arXiv:2303.14126","author":"Bird Jordan J","year":"2023","unstructured":"Jordan J Bird and Ahmad Lotfi. 2023. CIFAKE: Image Classification and Explainable Identification of AI-Generated Synthetic Images. arXiv preprint arXiv:2303.14126 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096","author":"Brock Andrew","year":"2018","unstructured":"Andrew Brock, Jeff Donahue, and Karen Simonyan. 2018. Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096 (2018)."},{"key":"e_1_3_2_1_7_1","volume-title":"AntifakePrompt: Prompt-Tuned Vision-Language Models are Fake Image Detectors. Arxiv","author":"Chang You-Ming","year":"2023","unstructured":"You-Ming Chang, Chen Yeh, Wei-Chen Chiu, and Ning Yu. 2023. AntifakePrompt: Prompt-Tuned Vision-Language Models are Fake Image Detectors. Arxiv (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 74-91","author":"Chen Junsong","year":"2024","unstructured":"Junsong Chen, Chongjian Ge, Enze Xie, Yue Wu, Lewei Yao, Xiaozhe Ren, Zhongdao Wang, Ping Luo, Huchuan Lu, and Zhenguo Li. 2024a. Pixart-\u03c3: Weak-to-strong training of diffusion transformer for 4k text-to-image generation. In Proceedings of the European Conference on Computer Vision (ECCV). 74-91."},{"key":"e_1_3_2_1_9_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024b. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Diffedit: Diffusion-based semantic image editing with mask guidance. arXiv preprint arXiv:2210.11427","author":"Couairon Guillaume","year":"2022","unstructured":"Guillaume Couairon, Jakob Verbeek, Holger Schwenk, and Matthieu Cord. 2022. Diffedit: Diffusion-based semantic image editing with mask guidance. arXiv preprint arXiv:2210.11427 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00582"},{"key":"e_1_3_2_1_12_1","volume-title":"Autoregressive Video Generation without Vector Quantization. arXiv preprint arXiv:2412.14169","author":"Deng Haoge","year":"2024","unstructured":"Haoge Deng, Ting Pan, Haiwen Diao, Zhengxiong Luo, Yufeng Cui, Huchuan Lu, Shiguang Shan, Yonggang Qi, and Xinlong Wang. 2024. Autoregressive Video Generation without Vector Quantization. arXiv preprint arXiv:2412.14169 (2024)."},{"key":"e_1_3_2_1_13_1","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. In Proceedings of the Advances in Neural Information Processing Systems (NeurIPS), Vol. 34. 8780-8794.","journal-title":"Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, et al., 2024. Scaling rectified flow transformers for high-resolution image synthesis. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning (ICML). PMLR, 3247-3258","author":"Frank Joel","year":"2020","unstructured":"Joel Frank, Thorsten Eisenhofer, Lea Sch\u00f6nherr, Asja Fischer, Dorothea Kolossa, and Thorsten Holz. 2020. Leveraging frequency analysis for deep fake image recognition. In International Conference on Machine Learning (ICML). PMLR, 3247-3258."},{"key":"e_1_3_2_1_16_1","volume-title":"CNN-generated images are surprisingly easy to spot...for now. Arxiv","author":"Frank Joel","year":"2021","unstructured":"Joel Frank and Thorsten Holz. 2021. CNN-generated images are surprisingly easy to spot...for now. Arxiv (2021)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9207034"},{"key":"e_1_3_2_1_18_1","article-title":"Massive online crowdsourced study of subjective and objective picture quality","volume":"25","author":"Ghadiyaram Deepti","year":"2015","unstructured":"Deepti Ghadiyaram and Alan C Bovik. 2015. Massive online crowdsourced study of subjective and objective picture quality. IEEE Transactions on Image Processing (TIP), Vol. 25 (2015).","journal-title":"IEEE Transactions on Image Processing (TIP)"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"e_1_3_2_1_20_1","volume-title":"Infinity: Scaling Bitwise AutoRegressive Modeling for High-Resolution Image Synthesis. arXiv preprint arXiv:2412.04431","author":"Han Jian","year":"2024","unstructured":"Jian Han, Jinlai Liu, Yi Jiang, Bin Yan, Yuqi Zhang, Zehuan Yuan, Bingyue Peng, and Xiaobing Liu. 2024. Infinity: Scaling Bitwise AutoRegressive Modeling for High-Resolution Image Synthesis. arXiv preprint arXiv:2412.04431 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.5555\/2566972.2566993"},{"key":"e_1_3_2_1_22_1","volume-title":"CogAgent: A Visual Language Model for GUI Agents. arXiv preprint arXiv:2312.08914","author":"Hong Wenyi","year":"2024","unstructured":"Wenyi Hong, Weihan Wang, Qingsong Lv, Jiazheng Xu, Wenmeng Yu, Junhui Ji, Yan Wang, Zihan Wang, Yuxuan Zhang, Juanzi Li, Bin Xu, Yuxiao Dong, Ming Ding, and Jie Tang. 2024. CogAgent: A Visual Language Model for GUI Agents. arXiv preprint arXiv:2312.08914 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"KonIQ-10k: An ecologically valid database for deep learning of blind image quality assessment","author":"Hosu Vlad","year":"2020","unstructured":"Vlad Hosu, Hanhe Lin, Tamas Sziranyi, and Dietmar Saupe. 2020. KonIQ-10k: An ecologically valid database for deep learning of blind image quality assessment. IEEE Transactions on Image Processing (TIP) (2020), 4041-4056."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. LoRA: Low-Rank Adaptation of Large Language Models. In Proceedings of the International Conference on Learning Representations (ICLR), Vol. 1. 3."},{"key":"e_1_3_2_1_25_1","volume-title":"SIDA: Social Media Image Deepfake Detection, Localization and Explanation with Large Multimodal Model. arXiv preprint arXiv:2412.04292","author":"Huang Zhenglin","year":"2025","unstructured":"Zhenglin Huang, Jinwei Hu, Xiangtai Li, Yiwei He, Xingyu Zhao, Bei Peng, Baoyuan Wu, Xiaowei Huang, and Guangliang Cheng. 2025. SIDA: Social Media Image Deepfake Detection, Localization and Explanation with Large Multimodal Model. arXiv preprint arXiv:2412.04292 (2025)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897820"},{"key":"e_1_3_2_1_27_1","volume-title":"Progressive growing of gans for improved quality, stability, and variation. arXiv preprint arXiv:1710.10196","author":"Karras Tero","year":"2017","unstructured":"Tero Karras, Timo Aila, Samuli Laine, and Jaakko Lehtinen. 2017. Progressive growing of gans for improved quality, stability, and variation. arXiv preprint arXiv:1710.10196 (2017)."},{"key":"e_1_3_2_1_28_1","unstructured":"Black Forest Labs. 2024. FLUX. https:\/\/github.com\/black-forest-labs\/flux."},{"key":"e_1_3_2_1_29_1","article-title":"Most apparent distortion: full-reference image quality assessment and the role of strategy","volume":"19","author":"Larson Eric Cooper","year":"2010","unstructured":"Eric Cooper Larson and Damon Michael Chandler. 2010. Most apparent distortion: full-reference image quality assessment and the role of strategy. Journal of Electronic Imaging (JEI), Vol. 19, 1 (2010).","journal-title":"Journal of Electronic Imaging (JEI)"},{"key":"e_1_3_2_1_30_1","volume-title":"LLaVA-OneVision: Easy Visual Task Transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, and Chunyuan Li. 2024b. LLaVA-OneVision: Easy Visual Task Transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Playground v2.5: Three Insights towards Enhancing Aesthetic Quality in Text-to-Image Generation. arXiv preprint arXiv:2402.17245","author":"Li Daiqing","year":"2024","unstructured":"Daiqing Li, Aleks Kamko, Ehsan Akhgari, Ali Sabet, Linmiao Xu, and Suhail Doshi. 2024a. Playground v2.5: Three Insights towards Enhancing Aesthetic Quality in Text-to-Image Generation. arXiv preprint arXiv:2402.17245 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large Multimodal Models. arXiv preprint arXiv:2407.07895","author":"Li Feng","year":"2024","unstructured":"Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, and Chunyuan Li. 2024c. LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large Multimodal Models. arXiv preprint arXiv:2407.07895 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2019.8743252"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV).","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In Proceedings of the European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00808"},{"key":"e_1_3_2_1_37_1","volume-title":"DeepSeek-VL: Towards Real-World Vision-Language Understanding. arXiv preprint arXiv:2403.05525","author":"Lu Haoyu","year":"2024","unstructured":"Haoyu Lu, Wen Liu, Bo Zhang, Bingxuan Wang, Kai Dong, Bo Liu, Jingxiang Sun, Tongzheng Ren, Zhuoshu Li, Hao Yang, Yaofeng Sun, Chengqi Deng, Hanwei Xu, Zhenda Xie, and Chong Ruan. 2024. DeepSeek-VL: Towards Real-World Vision-Language Understanding. arXiv preprint arXiv:2403.05525 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Meta AI Blog. Retrieved December","volume":"20","author":"Meta AI","year":"2024","unstructured":"AI Meta. 2024. Llama 3.2: Revolutionizing edge AI and vision with open, customizable models. Meta AI Blog. Retrieved December, Vol. 20 (2024), 2024."},{"key":"e_1_3_2_1_39_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02345"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_42_1","volume-title":"Signal Processing: Image Communication (SPIC)","author":"Ponomarenko Nikolay","year":"2015","unstructured":"Nikolay Ponomarenko, Lina Jin, Oleg Ieremeiev, Vladimir Lukin, Karen Egiazarian, Jaakko Astola, Benoit Vozel, Kacem Chehdi, Marco Carli, Federica Battisti, et al., 2015. Image database TID2013: Peculiarities, results and perspectives. Signal Processing: Image Communication (SPIC), Vol. 30 (2015)."},{"key":"e_1_3_2_1_43_1","volume-title":"Towards Explainable Partial-AIGC Image Quality Assessment. arXiv preprint arXiv:2504.09291","author":"Qian Jiaying","year":"2025","unstructured":"Jiaying Qian, Ziheng Jia, Zicheng Zhang, Zeyu Zhang, Guangtao Zhai, and Xiongkuo Min. 2025. Towards Explainable Partial-AIGC Image Quality Assessment. arXiv preprint arXiv:2504.09291 (2025)."},{"key":"e_1_3_2_1_44_1","volume-title":"DE-FAKE: Detection and Attribution of Fake Images Generated by Text-to-Image Diffusion Models. arXiv preprint arXiv:2210.06998","author":"Sha Zeyang","year":"2022","unstructured":"Zeyang Sha, Zheng Li, Ning Yu, and Yang Zhang. 2022. DE-FAKE: Detection and Attribution of Fake Images Generated by Text-to-Image Diffusion Models. arXiv preprint arXiv:2210.06998 (2022)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2006.881959"},{"key":"e_1_3_2_1_46_1","volume-title":"Robustness and Generalizability of Deepfake Detection: A Study with Diffusion Models. Arxiv","author":"Song Haixu","year":"2023","unstructured":"Haixu Song, Shiyu Huang, Yinpeng Dong, and Wei-Wei Tu. 2023. Robustness and Generalizability of Deepfake Detection: A Study with Diffusion Models. Arxiv (2023)."},{"key":"e_1_3_2_1_47_1","unstructured":"Chuangchuang Tan Yao Zhao Shikui Wei Guanghua Gu and Yunchao Wei. 2023. Learning on Gradients: Generalized Artifacts Representation for GAN-Generated Images Detection. In CVPR."},{"key":"e_1_3_2_1_48_1","unstructured":"Google Team. 2024a. Gemini1.5-pro. https:\/\/gemini.google.com\/. Accessed: 2025-03-08."},{"key":"e_1_3_2_1_49_1","volume-title":"Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis. arXiv preprint","author":"Team Kolors","year":"2024","unstructured":"Kolors Team. 2024b. Kolors: Effective Training of Diffusion Model for Photorealistic Text-to-Image Synthesis. arXiv preprint (2024)."},{"key":"e_1_3_2_1_50_1","unstructured":"Luisa Verdoliva Davide Cozzolino and Koki Nagano. 2022. 2022 IEEE Image and Video Processing Cup Synthetic Image Detection."},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of the CAAI International Conference on Artificial Intelligence (CICAI). 46-57","author":"Wang Jiarui","year":"2023","unstructured":"Jiarui Wang, Huiyu Duan, Jing Liu, Shi Chen, Xiongkuo Min, and Guangtao Zhai. 2023a. Aigciqa2023: A large-scale image quality assessment database for ai generated images: from the perspectives of quality, authenticity and correspondence. In Proceedings of the CAAI International Conference on Artificial Intelligence (CICAI). 46-57."},{"key":"e_1_3_2_1_52_1","volume-title":"Quality Assessment for AI Generated Images with Instruction Tuning. arXiv preprint arXiv:2405.07346","author":"Wang Jiarui","year":"2025","unstructured":"Jiarui Wang, Huiyu Duan, Guangtao Zhai, and Xiongkuo Min. 2025. Quality Assessment for AI Generated Images with Instruction Tuning. arXiv preprint arXiv:2405.07346 (2025)."},{"key":"e_1_3_2_1_53_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Fakespotter: A simple yet robust baseline for spotting ai-synthesized fake faces. arXiv preprint arXiv:1909.06122","author":"Wang Run","year":"2019","unstructured":"Run Wang, Felix Juefei-Xu, Lei Ma, Xiaofei Xie, Yihao Huang, Jian Wang, and Yang Liu. 2019. Fakespotter: A simple yet robust baseline for spotting ai-synthesized fake faces. arXiv preprint arXiv:1909.06122 (2019)."},{"key":"e_1_3_2_1_55_1","volume-title":"Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization. arXiv preprint arXiv:2411.10442","author":"Wang Weiyun","year":"2024","unstructured":"Weiyun Wang, Zhe Chen, Wenhai Wang, Yue Cao, Yangzhou Liu, Zhangwei Gao, Jinguo Zhu, Xizhou Zhu, Lewei Lu, Yu Qiao, and Jifeng Dai. 2024b. Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization. arXiv preprint arXiv:2411.10442 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"Benchmarking Deepart Detection. arXiv preprint arXiv:2302.14475","author":"Wang Yabin","year":"2023","unstructured":"Yabin Wang, Zhiwu Huang, and Xiaopeng Hong. 2023b. Benchmarking Deepart Detection. arXiv preprint arXiv:2302.14475 (2023)."},{"key":"e_1_3_2_1_57_1","volume-title":"Janus: Decoupling visual encoding for unified multimodal understanding and generation. arXiv preprint arXiv:2410.13848","author":"Wu Chengyue","year":"2024","unstructured":"Chengyue Wu, Xiaokang Chen, Zhiyu Wu, Yiyang Ma, Xingchao Liu, Zizheng Pan, Wen Liu, Zhenda Xie, Xingkai Yu, Chong Ruan, et al., 2024. Janus: Decoupling visual encoding for unified multimodal understanding and generation. arXiv preprint arXiv:2410.13848 (2024)."},{"key":"e_1_3_2_1_58_1","unstructured":"xAI Team. 2024. Grok2 Vision. https:\/\/grok.com\/. Accessed: 2025-03-08."},{"key":"e_1_3_2_1_59_1","volume-title":"HarmonyIQA: Pioneering Benchmark and Model for Image Harmonization Quality Assessment. arXiv preprint arXiv:2501.01116","author":"Xu Zitong","year":"2025","unstructured":"Zitong Xu, Huiyu Duan, Guangji Ma, Liu Yang, Jiarui Wang, Qingbo Wu, Xiongkuo Min, Guangtao Zhai, and Patrick Le Callet. 2025. HarmonyIQA: Pioneering Benchmark and Model for Image Harmonization Quality Assessment. arXiv preprint arXiv:2501.01116 (2025)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683164"},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Ye Jiabo","year":"2024","unstructured":"Jiabo Ye, Haiyang Xu, Haowei Liu, Anwen Hu, Ming Yan, Qi Qian, Ji Zhang, Fei Huang, and Jingren Zhou. 2024. mplug-owl3: Towards long image-sequence understanding in multi-modal large language models. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_62_1","volume-title":"InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition. arXiv preprint arXiv:2309.15112","author":"Zhang Pan","year":"2023","unstructured":"Pan Zhang, Xiaoyi Dong, Bin Wang, Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Shuangrui Ding, Songyang Zhang, Haodong Duan, Wenwei Zhang, Hang Yan, Xinyue Zhang, Wei Li, Jingwen Li, Kai Chen, Conghui He, Xingcheng Zhang, Yu Qiao, Dahua Lin, and Jiaqi Wang. 2023. InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition. arXiv preprint arXiv:2309.15112 (2023)."},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 70-86","author":"Zhao Shihao","year":"2024","unstructured":"Shihao Zhao, Shaozhe Hao, Bojia Zi, Huaizhe Xu, and Kwan-Yee K Wong. 2024. Bridging different language models and generative vision models for text-to-image generation. In Proceedings of the European Conference on Computer Vision (ECCV). 70-86."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758204","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:59:32Z","timestamp":1765310372000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758204"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":63,"alternative-id":["10.1145\/3746027.3758204","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758204","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}