{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,17]],"date-time":"2026-07-17T15:32:34Z","timestamp":1784302354270,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3761979","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"13686-13691","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["AV-Deepfake1M++: A Large-Scale Audio-Visual Deepfake Benchmark with Real-World Perturbations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7978-0860","authenticated-orcid":false,"given":"Zhixi","family":"Cai","sequence":"first","affiliation":[{"name":"Monash University, Melbourne, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5427-7587","authenticated-orcid":false,"given":"Kartik","family":"Kuckreja","sequence":"additional","affiliation":[{"name":"MBZUAI, Abu Dhabi, United Arab Emirates"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2639-8374","authenticated-orcid":false,"given":"Shreya","family":"Ghosh","sequence":"additional","affiliation":[{"name":"The University of Queensland, Brisbane, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1041-9350","authenticated-orcid":false,"given":"Akanksha","family":"Chuchra","sequence":"additional","affiliation":[{"name":"IIT Ropar, Ropar, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9746-276X","authenticated-orcid":false,"given":"Muhammad Haris","family":"Khan","sequence":"additional","affiliation":[{"name":"MBZUAI, Abu Dhabi, United Arab Emirates"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8244-2165","authenticated-orcid":false,"given":"Usman","family":"Tariq","sequence":"additional","affiliation":[{"name":"American University of Sharjah, Sharjah, United Arab Emirates"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8356-4909","authenticated-orcid":false,"given":"Tom","family":"Gedeon","sequence":"additional","affiliation":[{"name":"Curtin University, Perth, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2230-1440","authenticated-orcid":false,"given":"Abhinav","family":"Dhall","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018. LRS3-TED: a large-scale dataset for visual speech recognition. arXiv:1809.00496 [cs]."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3689145"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680795"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103818"},{"key":"e_1_3_2_1_5_1","first-page":"1","article-title":"Do You Really Mean That? Content Driven Audio-Visual Deepfake Dataset and Multimodal Method for Temporal Forgery Localization. In 2022 International Conference on Digital Image Computing","author":"Cai Zhixi","year":"2022","unstructured":"Zhixi Cai, Kalin Stefanov, Abhinav Dhall, and Munawar Hayat. 2022. Do You Really Mean That? Content Driven Audio-Visual Deepfake Dataset and Multimodal Method for Temporal Forgery Localization. In 2022 International Conference on Digital Image Computing: Techniques and Applications (DICTA). 1-10.","journal-title":"Techniques and Applications (DICTA)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Edresson Casanova Kelly Davis Eren G\u00f6lge G\u00f6rkem G\u00f6knar Iulian Gulea Logan Hart Aya Aljafari Joshua Meyer Reuben Morais Samuel Olayemi and Julian Weber. 2024. XTTS: a Massively Multilingual Zero-Shot Text-to-Speech Model. arXiv:2406.04904 [cs eess].","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"e_1_3_2_1_7_1","first-page":"2640","volume-title":"Proceedings of the 39th International Conference on Machine Learning. 2709-2720","author":"Casanova Edresson","unstructured":"Edresson Casanova, Julian Weber, Christopher D. Shulby, Arnaldo Candido Junior, Eren G\u00f6lge, and Moacir A. Ponti. 2022. YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for Everyone. In Proceedings of the 39th International Conference on Machine Learning. 2709-2720. ISSN: 2640-3498."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Yushen Chen Zhikang Niu Ziyang Ma Keqi Deng Chunhui Wang Jian Zhao Kai Yu and Xie Chen. 2025. F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching. arXiv:2410.06885 [eess].","DOI":"10.18653\/v1\/2025.acl-long.313"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_1_10_1","first-page":"1086","article-title":"VoxCeleb2","volume":"2018","author":"Chung Joon Son","year":"2018","unstructured":"Joon Son Chung, Arsha Nagrani, and Andrew Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In Interspeech 2018. 1086-1090.","journal-title":"Deep Speaker Recognition. In Interspeech"},{"key":"e_1_3_2_1_11_1","volume-title":"The DeepFake Detection Challenge (DFDC) Dataset. arXiv","author":"Dolhansky Brian","year":"2006","unstructured":"Brian Dolhansky, Joanna Bitton, Ben Pflaum, Jikuo Lu, Russ Howes, Menglin Wang, and Cristian Canton Ferrer. 2020. The DeepFake Detection Challenge (DFDC) Dataset. arXiv: 2006.07397 [cs]."},{"key":"e_1_3_2_1_12_1","first-page":"3291","article-title":"Real Time Speech Enhancement in the Waveform Domain","volume":"2020","author":"D\u00e9fossez Alexandre","year":"2020","unstructured":"Alexandre D\u00e9fossez, Gabriel Synnaeve, and Yossi Adi. 2020. Real Time Speech Enhancement in the Waveform Domain. In Interspeech 2020. 3291-3295.","journal-title":"Interspeech"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Parul Gupta Shreya Ghosh Tom Gedeon Thanh-Toan Do and Abhinav Dhall. 2025. Multiverse Through Deepfakes: The MultiFakeVerse Dataset of Person-Centric Visual and Conceptual Manipulations. arXiv:2506.00868 [cs].","DOI":"10.1145\/3746027.3758283"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00434"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-78341-8_12"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02685"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the 32nd International Conference on Neural Information Processing Systems. 4485-4495","author":"Jia Ye","year":"2018","unstructured":"Ye Jia, Yu Zhang, Ron J. Weiss, Quan Wang, Jonathan Shen, Fei Ren, Zhifeng Chen, Patrick Nguyen, Ruoming Pang, Ignacio Lopez Moreno, and Yonghui Wu. 2018. Transfer learning from speaker verification to multispeaker text-to-speech synthesis. In Proceedings of the 32nd International Conference on Neural Information Processing Systems. 4485-4495."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00296"},{"key":"e_1_3_2_1_19_1","volume-title":"FakeAVCeleb: A Novel Audio-Video Multimodal Deepfake Dataset. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Khalid Hasam","unstructured":"Hasam Khalid, Shahroz Tariq, Minha Kim, and Simon S. Woo. 2021. FakeAVCeleb: A Novel Audio-Video Multimodal Deepfake Dataset. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_1_20_1","first-page":"2640","volume-title":"Proceedings of the 38th International Conference on Machine Learning. 5530-5540","author":"Kim Jaehyeon","year":"2021","unstructured":"Jaehyeon Kim, Jungil Kong, and Juhee Son. 2021. Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. In Proceedings of the 38th International Conference on Machine Learning. 5530-5540. ISSN: 2640-3498."},{"key":"e_1_3_2_1_21_1","volume-title":"Pindrop it! Audio and Visual Deepfake Countermeasures for Robust Detection and Fine Grained-Localization. arXiv preprint arXiv:2508.08141","author":"Klein Nicholas","year":"2025","unstructured":"Nicholas Klein, Hemlata Tak, James Fullwood, Krishna Regmi, Leonidas Spinoulas, Ganesh Sivaraman, Tianxiang Chen, and Elie Khoury. 2025. Pindrop it! Audio and Visual Deepfake Countermeasures for Robust Detection and Fine Grained-Localization. arXiv preprint arXiv:2508.08141 (2025)."},{"key":"e_1_3_2_1_22_1","unstructured":"Pavel Korshunov and Sebastien Marcel. 2018. DeepFakes: a New Threat to Face Recognition? Assessment and Detection. arXiv:1812.08685 [cs]."},{"key":"e_1_3_2_1_23_1","volume-title":"Muhammad Haris Khan, and Abhinav Dhall","author":"Kuckreja Kartik","year":"2025","unstructured":"Kartik Kuckreja, Parul Gupta, Injy Hamed, Thamar Solorio, Muhammad Haris Khan, and Abhinav Dhall. 2025. Tell me Habibi, is it Real or Fake? arXiv:2505.22581 [cs]."},{"key":"e_1_3_2_1_24_1","volume-title":"KLASSify to Verify: Audio-Visual Deepfake Detection Using SSL-based Audio and Handcrafted Visual Features. arXiv preprint arXiv:2508.07337","author":"Kukanov Ivan","year":"2025","unstructured":"Ivan Kukanov and Jun Wah Ng. 2025. KLASSify to Verify: Audio-Visual Deepfake Detection Using SSL-based Audio and Handcrafted Visual Features. arXiv preprint arXiv:2508.07337 (2025)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01057"},{"key":"e_1_3_2_1_26_1","unstructured":"Chunyu Li Chao Zhang Weikai Xu Jingyu Lin Jinghui Xie Weiguo Feng Bingyue Peng Cunjian Chen and Weiwei Xing. 2025. LatentSync: Taming Audio-Conditioned Latent Diffusion Models for Lip Sync with SyncNet Supervision. arXiv:2412.09262 [cs]."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00327"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3285283"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00521"},{"key":"e_1_3_2_1_30_1","volume-title":"LayLens: Improving Deepfake Understanding through Simplified Explanations. arXiv preprint arXiv:2507.10066","author":"Narang Abhijeet","year":"2025","unstructured":"Abhijeet Narang, Parul Gupta, Liuyijia Su, and Abhinav Dhall. 2025. LayLens: Improving Deepfake Understanding through Simplified Explanations. arXiv preprint arXiv:2507.10066 (2025)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00939"},{"key":"e_1_3_2_1_32_1","unstructured":"Dufou Nick and Jigsaw Andrew. 2019. Contributing Data to Deepfake Detection Research."},{"key":"e_1_3_2_1_33_1","unstructured":"OpenAI. 2024. GPT-4o System Card."},{"key":"e_1_3_2_1_34_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul F. Christiano, Jan Leike, and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. In Advances in Neural Information Processing Systems, Vol. 35. 27730-27744.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_35_1","first-page":"118025","article-title":"Semi-Truths: A Large-Scale Dataset of AI-Augmented Images for Evaluating Robustness of AI-Generated Image detectors","volume":"37","author":"Pal Anisha","year":"2024","unstructured":"Anisha Pal, Julia Kruk, Mansi Phute, Manognya Bhattaram, Diyi Yang, Duen Horng Chau, and Judy Hoffman. 2024. Semi-Truths: A Large-Scale Dataset of AI-Augmented Images for Evaluating Robustness of AI-Generated Image detectors. In Advances in Neural Information Processing Systems, Vol. 37. 118025-118051.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the 28th ACM International Conference on Multimedia. 484-492","author":"Prajwal K R","unstructured":"K R Prajwal, Rudrabha Mukhopadhyay, Vinay P. Namboodiri, and C.V. Jawahar. 2020. A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild. In Proceedings of the 28th ACM International Conference on Multimedia. 484-492."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3688983"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00009"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3614164"},{"key":"e_1_3_2_1_40_1","volume-title":"Multi-Lingual Deepfake Dataset. In The Thirteenth International Conference on Learning Representations.","author":"Thakral Kartik","year":"2024","unstructured":"Kartik Thakral, Rishabh Ranjan, Akanksha Singh, Akshat Jain, Mayank Vatsa, and Richa Singh. 2024. ILLUSION: Unveiling Truth with a Comprehensive Multi-Modal, Multi-Lingual Deepfake Dataset. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01408"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3688985"},{"key":"e_1_3_2_1_43_1","volume-title":"HOLA: Enhancing Audio-visual Deepfake Detection via Hierarchical Contextual Aggregations and Efficient Pre-training. arXiv preprint arXiv:2507.22781","author":"Wu Xuecheng","year":"2025","unstructured":"Xuecheng Wu, Danlei Huang, Heli Sun, Xinyi Yin, Yifan Wang, Hao Wang, Jia Zhang, Fei Wang, Peihao Guo, Suyu Xing, et al., 2025. HOLA: Enhancing Audio-visual Deepfake Detection via Hierarchical Contextual Aggregations and Efficient Pre-training. arXiv preprint arXiv:2507.22781 (2025)."},{"key":"e_1_3_2_1_44_1","volume-title":"Exposing Deep Fakes Using Inconsistent Head Poses. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 8261-8265","author":"Yang Xin","year":"2019","unstructured":"Xin Yang, Yuezun Li, and Siwei Lyu. 2019. Exposing Deep Fakes Using Inconsistent Head Poses. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 8261-8265. ISSN: 2379-190X."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3688984"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00572"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413769"},{"key":"e_1_3_2_1_48_1","volume-title":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 13176-13180","author":"Zingarini G.","unstructured":"G. Zingarini, D. Cozzolino, R. Corvi, G. Poggi, and L. Verdoliva. 2024. M3DSYNTH: A Dataset of Medical 3D Images with AI-Generated Local Manipulations. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 13176-13180. ISSN: 2379-190X."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3761979","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:34Z","timestamp":1765339774000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3761979"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":48,"alternative-id":["10.1145\/3746027.3761979","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3761979","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}