{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T19:57:51Z","timestamp":1777406271448,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758283","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"13258-13265","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Multiverse Through Deepfakes: The MultiFakeVerse Dataset of Person-Centric Visual and Conceptual Manipulations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4379-1573","authenticated-orcid":false,"given":"Parul","family":"Gupta","sequence":"first","affiliation":[{"name":"Monash University, Melbourne, Victoria, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2639-8374","authenticated-orcid":false,"given":"Shreya","family":"Ghosh","sequence":"additional","affiliation":[{"name":"Curtin University, Perth, WA, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8356-4909","authenticated-orcid":false,"given":"Tom","family":"Gedeon","sequence":"additional","affiliation":[{"name":"Curtin University, Perth, WA, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6249-0848","authenticated-orcid":false,"given":"Thanh-Toan","family":"Do","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Victoria, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2230-1440","authenticated-orcid":false,"given":"Abhinav","family":"Dhall","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Victoria, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. arXiv preprint arXiv:2312.11805","author":"Anil Rohan","year":"2023","unstructured":"Rohan Anil and et al., 2023. Gemini: A Family of Highly Capable Multimodal Models. arXiv preprint arXiv:2312.11805 (2023). https:\/\/arxiv.org\/abs\/2312.11805"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3356122"},{"key":"e_1_3_2_1_3_1","volume-title":"InstructPix2Pix: Learning to Follow Image Editing Instructions. arXiv preprint arXiv:2211.09800","author":"Brooks Tim","year":"2022","unstructured":"Tim Brooks, Aleksander Holynski, and Alexei A Efros. 2022. InstructPix2Pix: Learning to Follow Image Editing Instructions. arXiv preprint arXiv:2211.09800 (2022)."},{"key":"e_1_3_2_1_4_1","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 1877-1901."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680795"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103818"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA56598.2022.10034605"},{"key":"e_1_3_2_1_8_1","first-page":"3645","article-title":"SC-GlowTTS: An Efficient Zero-Shot Multi-Speaker Text-To-Speech Model. In Interspeech 2021","author":"Casanova Edresson","year":"2021","unstructured":"Edresson Casanova, Christopher Shulby, Eren G\u00f6lge, Nicolas Michael M\u00fcller, Frederico Santos De Oliveira, Arnaldo Candido Jr., Anderson Da Silva Soares, Sandra Maria Aluisio, and Moacir Antonelli Ponti. 2021. SC-GlowTTS: An Efficient Zero-Shot Multi-Speaker Text-To-Speech Model. In Interspeech 2021. ISCA, 3645-3649.","journal-title":"ISCA"},{"key":"e_1_3_2_1_9_1","unstructured":"You-Ming Chang Chen Yeh Wei-Chen Chiu and Ning Yu. 2024. AntifakePrompt: Prompt-Tuned Vision-Language Models are Fake Image Detectors. arXiv:2310.17419 [cs.CV] https:\/\/arxiv.org\/abs\/2310.17419"},{"key":"e_1_3_2_1_10_1","volume-title":"European Conference on Computer Vision. Springer, 370-387","author":"Chen Lin","year":"2024","unstructured":"Lin Chen, Jinsong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2024. Sharegpt4v: Improving large multi-modal models with better captions. In European Conference on Computer Vision. Springer, 370-387."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530164"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01974"},{"key":"e_1_3_2_1_14_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. In Advances in Neural Information Processing Systems, Vol. 30. https:\/\/papers.nips.cc\/paper\/2017\/hash\/8a1d694707eb0fefe65871369074926d-Abstract.html"},{"key":"e_1_3_2_1_15_1","volume-title":"SIDA: Social Media Image Deepfake Detection, Localization and Explanation with Large Multimodal Model. arXiv preprint arXiv:2412.04292","author":"Huang Zhenglin","year":"2024","unstructured":"Zhenglin Huang, Jinwei Hu, Xiangtai Li, Yiwei He, Xingyu Zhao, Bei Peng, Baoyuan Wu, Xiaowei Huang, and Guangliang Cheng. 2024. SIDA: Social Media Image Deepfake Detection, Localization and Explanation with Large Multimodal Model. arXiv preprint arXiv:2412.04292 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2916866"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.289"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3075846"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577163.3595101"},{"key":"e_1_3_2_1_21_1","volume-title":"Multi-spectral Class Center Network for Face Manipulation Detection and Localization. arXiv preprint arXiv:2305.10794","author":"Miao Changtao","year":"2023","unstructured":"Changtao Miao, Qi Chu, Zhentao Tan, Zhenchao Jin, Tao Gong, Wanyi Zhuang, Yue Wu, Bin Liu, Honggang Hu, and Nenghai Yu. 2023. Multi-spectral Class Center Network for Face Manipulation Detection and Localization. arXiv preprint arXiv:2305.10794 (2023). https:\/\/arxiv.org\/abs\/2305.10794"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00939"},{"key":"e_1_3_2_1_23_1","volume-title":"Dung Tien Nguyen, Duc Thanh Nguyen, Thien Huynh-The, Saeid Nahavandi, Thanh Tam Nguyen, Quoc-Viet Pham, and Cuong M. Nguyen.","author":"Nguyen Thanh Thi","year":"2019","unstructured":"Thanh Thi Nguyen, Quoc Viet Hung Nguyen, Dung Tien Nguyen, Duc Thanh Nguyen, Thien Huynh-The, Saeid Nahavandi, Thanh Tam Nguyen, Quoc-Viet Pham, and Cuong M. Nguyen. 2019. Deep Learning for Deepfakes Creation and Detection: A Survey. arXiv preprint arXiv:1909.11573 (2019). https:\/\/arxiv.org\/abs\/1909.11573"},{"key":"e_1_3_2_1_24_1","volume-title":"IMD2020: A large-scale annotated dataset tailored for detecting manipulated images. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision workshops. 71-80","author":"Novozamsky Adam","year":"2020","unstructured":"Adam Novozamsky, Babak Mahdian, and Stanislav Saic. 2020. IMD2020: A large-scale annotated dataset tailored for detecting manipulated images. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision workshops. 71-80."},{"key":"e_1_3_2_1_25_1","unstructured":"OpenAI. 2024. GPT-4o System Card. https:\/\/arxiv.org\/abs\/2410.21276. Accessed: 2025-05-31."},{"key":"e_1_3_2_1_26_1","first-page":"118025","article-title":"Semi-Truths: A Large-Scale Dataset of AI-Augmented Images for Evaluating Robustness of AI-Generated Image detectors","volume":"37","author":"Pal Anisha","year":"2024","unstructured":"Anisha Pal, Julia Kruk, Mansi Phute, Manognya Bhattaram, Diyi Yang, Duen Horng Chau, and Judy Hoffman. 2024. Semi-Truths: A Large-Scale Dataset of AI-Augmented Images for Evaluating Robustness of AI-Generated Image detectors. Advances in Neural Information Processing Systems, Vol. 37 (2024), 118025-118051.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","volume-title":"LANCE: Stress-testing Visual Models by Generating Language-guided Counterfactual Images. In Neural Information Processing Systems (NeurIPS).","author":"Prabhu Viraj","year":"2023","unstructured":"Viraj Prabhu, Sriram Yenamandra, Prithvijit Chattopadhyay, and Judy Hoffman. 2023. LANCE: Stress-testing Visual Models by Generating Language-guided Counterfactual Images. In Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP49359.2023.10222083"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-024-19906-1"},{"key":"e_1_3_2_1_30_1","volume-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers. arXiv preprint arXiv:2304.09116","author":"Shen Kai","year":"2023","unstructured":"Kai Shen, Zeqian Ju, Xu Tan, Yanqing Liu, Yichong Leng, Lei He, Tao Qin, Sheng Zhao, and Jiang Bian. 2023. Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers. arXiv preprint arXiv:2304.09116 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"On the detection of digital face manipulation. arXiv","author":"Stehouwer Joel","year":"2019","unstructured":"Joel Stehouwer, Hao Dang, Feng Liu, Xiaoming Liu, and Anil Jain. 2019. On the detection of digital face manipulation. arXiv (2019), arXiv-1910."},{"key":"e_1_3_2_1_32_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023a. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_33_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023b. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 [cs]."},{"key":"e_1_3_2_1_34_1","first-page":"1146","volume-title":"Science","volume":"359","author":"Vosoughi Soroush","year":"2018","unstructured":"Soroush Vosoughi, Deb Roy, and Sinan Aral. 2018. The Spread of True and False News Online. Science, Vol. 359, 6380 (2018), 1146-1151. https:\/\/news.mit.edu\/2018\/study-twitter-false-news-travels-faster-true-stories-0308 Accessed: 2024-05-30."},{"key":"e_1_3_2_1_35_1","volume-title":"Image Editing with Diffusion Models: A Survey. arXiv preprint arXiv:2504.13226","author":"Wang Jia","year":"2025","unstructured":"Jia Wang, Jie Hu, Xiaoqi Ma, Hanghang Ma, Xiaoming Wei, and Enhua Wu. 2025. Image Editing with Diffusion Models: A Survey. arXiv preprint arXiv:2504.13226 (2025). https:\/\/arxiv.org\/abs\/2504.13226"},{"key":"e_1_3_2_1_36_1","volume-title":"Fakespotter: A simple yet robust baseline for spotting ai-synthesized fake faces. arXiv preprint arXiv:1909.06122","author":"Wang Run","year":"2019","unstructured":"Run Wang, Felix Juefei-Xu, Lei Ma, Xiaofei Xie, Yihao Huang, Jian Wang, and Yang Liu. 2019. Fakespotter: A simple yet robust baseline for spotting ai-synthesized fake faces. arXiv preprint arXiv:1909.06122 (2019)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00872"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_1_39_1","volume-title":"A Timely Survey on Vision Transformer for Deepfake Detection. arXiv preprint arXiv:2405.08463","author":"Wang Zhikan","year":"2024","unstructured":"Zhikan Wang, Zhongyao Cheng, Jiajie Xiong, Xun Xu, Tianrui Li, Bharadwaj Veeravalli, and Xulei Yang. 2024. A Timely Survey on Vision Transformer for Deepfake Detection. arXiv preprint arXiv:2405.08463 (2024). https:\/\/arxiv.org\/abs\/2405.08463"},{"key":"e_1_3_2_1_40_1","volume-title":"The Disturbing World of Deepfake Pornography. WIRED (October","author":"Wardle Claire","year":"2019","unstructured":"Claire Wardle. 2019. The Disturbing World of Deepfake Pornography. WIRED (October 2019). https:\/\/www.wired.com\/story\/deepfakes-pornography Accessed: 2024-05-30."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"e_1_3_2_1_42_1","volume-title":"Long-CLIP: Unlocking the Long-Text Capability of CLIP. arXiv preprint arXiv:2403.15378","author":"Zhang Beichen","year":"2024","unstructured":"Beichen Zhang, Pan Zhang, Xiaoyi Dong, Yuhang Zang, and Jiaqi Wang. 2024. Long-CLIP: Unlocking the Long-Text Capability of CLIP. arXiv preprint arXiv:2403.15378 (2024)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Ning Zhang Manohar Paluri Yaniv Taigman Rob Fergus and Lubomir Bourdev. 2015. Beyond Frontal Faces: Improving Person Recognition Using Multiple Cues. arXiv:1501.05703 [cs.CV] https:\/\/arxiv.org\/abs\/1501.05703","DOI":"10.1109\/CVPR.2015.7299113"},{"key":"e_1_3_2_1_44_1","unstructured":"Zechuan Zhang Ji Xie Yu Lu Zongxin Yang and Yi Yang. 2025. In-Context Edit: Enabling Instructional Image Editing with In-Context Generation in Large Scale Diffusion Transformer. arXiv:2504.20690 [cs.CV] https:\/\/arxiv.org\/abs\/2504.20690"},{"key":"e_1_3_2_1_45_1","volume-title":"Rich and poor texture contrast: A simple yet effective approach for ai-generated image detection. CoRR","author":"Zhong Nan","year":"2023","unstructured":"Nan Zhong, Yiran Xu, Zhenxing Qian, and Xinpeng Zhang. 2023. Rich and poor texture contrast: A simple yet effective approach for ai-generated image detection. CoRR (2023)."},{"key":"e_1_3_2_1_46_1","first-page":"77771","article-title":"Genimage: A million-scale benchmark for detecting ai-generated image","volume":"36","author":"Zhu Mingjian","year":"2023","unstructured":"Mingjian Zhu, Hanting Chen, Qiangyu Yan, Xudong Huang, Guanyu Lin, Wei Li, Zhijun Tu, Hailin Hu, Jie Hu, and Yunhe Wang. 2023. Genimage: A million-scale benchmark for detecting ai-generated image. Advances in Neural Information Processing Systems, Vol. 36 (2023), 77771-77782.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446605"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758283","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:09:15Z","timestamp":1765343355000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758283"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":47,"alternative-id":["10.1145\/3746027.3758283","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758283","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}