{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T02:59:05Z","timestamp":1769137145003,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"name":"National Science Foundation of China","award":["62472348"],"award-info":[{"award-number":["62472348"]}]},{"name":"the Aviation Science Foundation","award":["2023M071070002, 2024M071070001"],"award-info":[{"award-number":["2023M071070002, 2024M071070001"]}]},{"name":"the Xianyang Major Scientific and Technological Achievements Transformation Special Project?","award":["L2024-ZDKJ-ZDCGZH-0012"],"award-info":[{"award-number":["L2024-ZDKJ-ZDCGZH-0012"]}]},{"name":"the Key Research and Development Program of Shaanxi","award":["2023-YBGY-230, 2024GX-YBXM-533"],"award-info":[{"award-number":["2023-YBGY-230, 2024GX-YBXM-533"]}]},{"name":"the Sichuan Science and Technology Program","award":["2025ZNSFSC0468"],"award-info":[{"award-number":["2025ZNSFSC0468"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LQ23F030009"],"award-info":[{"award-number":["LQ23F030009"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3761980","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"13692-13699","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["HOLA: Enhancing Audio-visual Deepfake Detection via Hierarchical Contextual Aggregations and Efficient Pre-training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6244-0269","authenticated-orcid":false,"given":"Xuecheng","family":"Wu","sequence":"first","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0818-0301","authenticated-orcid":false,"given":"Heli","family":"Sun","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5929-6455","authenticated-orcid":false,"given":"Danlei","family":"Huang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9553-8654","authenticated-orcid":false,"given":"Xinyi","family":"Yin","sequence":"additional","affiliation":[{"name":"Zhengzhou University, Zhengzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8153-492X","authenticated-orcid":false,"given":"Yifan","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6959-7237","authenticated-orcid":false,"given":"Hao","family":"Wang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8922-1822","authenticated-orcid":false,"given":"Jia","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3920-0264","authenticated-orcid":false,"given":"Fei","family":"Wang","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5909-1340","authenticated-orcid":false,"given":"Peihao","family":"Guo","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2615-9464","authenticated-orcid":false,"given":"Suyu","family":"Xing","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1569-5362","authenticated-orcid":false,"given":"Junxiao","family":"Xue","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6463-5158","authenticated-orcid":false,"given":"Liang","family":"He","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al., 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2.5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_3_1","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"2","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML, Vol. 2. 4.","journal-title":"ICML"},{"key":"e_1_3_2_1_4_1","unstructured":"ByteDance. 2025. Doubao-1.5-vision-pro-32k. https:\/\/volcengine.com\/product\/doubao."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680795"},{"key":"e_1_3_2_1_6_1","volume-title":"Usman Tariq, Tom Gedeon, and Abhinav Dhall.","author":"Cai Zhixi","year":"2025","unstructured":"Zhixi Cai, Kartik Kuckreja, Shreya Ghosh, Akanksha Chuchra, Muhammad Haris Khan, Usman Tariq, Tom Gedeon, and Abhinav Dhall. 2025. AV-Deepfake1M: A Large-Scale Audio-Visual Deepfake Benchmark with Real-World Perturbations. arXiv preprint arXiv:2507.20579 (2025)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA56598.2022.10034605"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00101"},{"key":"e_1_3_2_1_10_1","volume-title":"Ecapa-tdnn: Emphasized channel attention, propagation and aggregation in tdnn based speaker verification. arXiv preprint arXiv:2005.07143","author":"Desplanques Brecht","year":"2020","unstructured":"Brecht Desplanques, Jenthe Thienpondt, and Kris Demuynck. 2020. Ecapa-tdnn: Emphasized channel attention, propagation and aggregation in tdnn based speaker verification. 
arXiv preprint arXiv:2005.07143 (2020)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054017"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733415"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-022-01083-2"},{"key":"e_1_3_2_1_14_1","volume-title":"Res2net: A new multi-scale backbone architecture","author":"Gao Shang-Hua","year":"2019","unstructured":"Shang-Hua Gao, Ming-Ming Cheng, Kai Zhao, Xin-Yu Zhang, Ming-Hsuan Yang, and Philip Torr. 2019. Res2net: A new multi-scale backbone architecture. IEEE transactions on pattern analysis and machine intelligence, Vol. 43, 2 (2019), 652-662."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/IConSCEPT61884.2024.10627878"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00434"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"e_1_3_2_1_19_1","volume-title":"FakeAVCeleb: A novel audio-video multimodal deepfake dataset. arXiv preprint arXiv:2108.05080","author":"Khalid Hasam","year":"2021","unstructured":"Hasam Khalid, Shahroz Tariq, Minha Kim, and Simon S Woo. 2021. FakeAVCeleb: A novel audio-video multimodal deepfake dataset. arXiv preprint arXiv:2108.05080 (2021)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00327"},{"key":"e_1_3_2_1_22_1","first-page":"1476","article-title":"MSDWild: Multi-modal Speaker Diarization Dataset in the Wild","author":"Liu Tao","year":"2022","unstructured":"Tao Liu, Shuai Fan, Xu Xiang, Hongbo Song, Shaoxiong Lin, Jiaqi Sun, Tianyuan Han, Siyuan Chen, Binwei Yao, Sen Liu, et al., 2022a. MSDWild: Multi-modal Speaker Diarization Dataset in the Wild.. In INTERSPEECH. 1476-1480.","journal-title":"INTERSPEECH."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437880.3460400"},{"key":"e_1_3_2_1_25_1","volume-title":"DDL: A Dataset for Interpretable Deepfake Detection and Localization in Real-World Scenarios. arXiv preprint arXiv:2506.23292","author":"Miao Changtao","year":"2025","unstructured":"Changtao Miao, Yi Zhang, Weize Gao, Man Luo, Weiwei Feng, Zhiya Tan, Jianshu Li, Ajian Liu, Yunfeng Diao, Qi Chu, et al., 2025. DDL: A Dataset for Interpretable Deepfake Detection and Localization in Real-World Scenarios. arXiv preprint arXiv:2506.23292 (2025)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0287503"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01647"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02559"},{"key":"e_1_3_2_1_29_1","unstructured":"OpenAI. 2024. Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/."},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI. 2025. OpenAI o3. https:\/\/openai.com\/index\/introducing-o3-and-o4-mini\/."},{"key":"e_1_3_2_1_31_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. 
Advances in neural information processing systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al., 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3263288"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00009"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCB52358.2021.9484408"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","unstructured":"Sahil Sharma Ashima Sood and Vijay Kumar. 2024. Deepfake Synthetic-20K Dataset. https:\/\/doi.org\/10.21227\/67x4-9g14","DOI":"10.21227\/67x4-9g14"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102382"},{"key":"e_1_3_2_1_37_1","unstructured":"Gemini Team. 2025. Gemini-2.5-pro-preview-03-25. https:\/\/deepmind.google\/technologies\/gemini\/pro\/."},{"key":"e_1_3_2_1_38_1","volume-title":"QVQ: To See the World with Wisdom. https:\/\/qwenlm.github.io\/blog\/qvq-72b-preview\/.","author":"Team Qwen","year":"2024","unstructured":"Qwen Team. 2024. QVQ: To See the World with Wisdom. https:\/\/qwenlm.github.io\/blog\/qvq-72b-preview\/."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2022.03.026"},{"key":"e_1_3_2_1_41_1","volume-title":"CAM: A Fast and Efficient Network For Speaker Verification Using Context-Aware Masking. arXiv preprint arXiv:2303.00332","author":"Wang Hui","year":"2023","unstructured":"Hui Wang, Siqi Zheng, Yafeng Chen, Luyao Cheng, and Qian Chen. 2023. CAM: A Fast and Efficient Network For Speaker Verification Using Context-Aware Masking. arXiv preprint arXiv:2303.00332 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2024.104133"},{"key":"e_1_3_2_1_43_1","unstructured":"Xuecheng Wu Jiaxing Liu Danlei Huang Xiaoyu Li et al. 2025a. ViC-Bench: Benchmarking Visual-Interleaved Chain-of-Thought Capability in MLLMs with Free-Style Intermediate State Representations. arXiv preprint arXiv:2505.14404 (2025)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00854"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733453"},{"key":"e_1_3_2_1_46_1","unstructured":"xAI. 2025. xAI Grok 4. https:\/\/x.ai\/news\/grok-4."},{"key":"e_1_3_2_1_47_1","first-page":"29387","article-title":"Df40: Toward next-generation deepfake detection","volume":"37","author":"Yan Zhiyuan","year":"2024","unstructured":"Zhiyuan Yan, Taiping Yao, Shen Chen, Yandan Zhao, Xinghe Fu, Junwei Zhu, Donghao Luo, Chengjie Wang, Shouhong Ding, Yunsheng Wu, et al., 2024. Df40: Toward next-generation deepfake detection. Advances in Neural Information Processing Systems, Vol. 37 (2024), 29387-29434.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3625100"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_1_50_1","volume-title":"HKD4VLM: A Progressive Hybrid Knowledge Distillation Framework for Robust Multimodal Hallucination and Factuality Detection in VLMs. 
arXiv preprint arXiv:2506.13038","author":"Zhang Zijian","year":"2025","unstructured":"Zijian Zhang, Xuecheng Wu, Danlei Huang, Siyu Yan, Chong Peng, and Xuezhi Cao. 2025a. HKD4VLM: A Progressive Hybrid Knowledge Distillation Framework for Robust Multimodal Hallucination and Factuality Detection in VLMs. arXiv preprint arXiv:2506.13038 (2025)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW67362.2025.00120"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00222"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01477"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_38"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3761980","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:06:35Z","timestamp":1765339595000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3761980"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":54,"alternative-id":["10.1145\/3746027.3761980","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3761980","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
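For context, the record above is a Crossref REST API "work" response ("message-type":"work"): the bibliographic metadata sits under the "message" key, and records of this shape are served by the public endpoint https://api.crossref.org/works/{DOI}. Below is a minimal, standard-library-only Python sketch of how such a record can be fetched and the fields above read back; the mailto contact in the User-Agent is a placeholder (per Crossref's polite-use convention), not part of the record.

```python
# Minimal sketch: fetch this Crossref "work" record and read back a few
# of the fields shown above. Uses only the Python standard library.
import json
import urllib.request

DOI = "10.1145/3746027.3761980"
url = f"https://api.crossref.org/works/{DOI}"

# Placeholder contact address; Crossref asks polite clients to identify
# themselves with a mailto in the User-Agent.
req = urllib.request.Request(
    url, headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"}
)

with urllib.request.urlopen(req) as resp:
    record = json.load(resp)

msg = record["message"]                      # metadata lives under "message"
print(msg["title"][0])                       # "HOLA: Enhancing Audio-visual ..."
print(msg["DOI"], msg["type"])               # 10.1145/3746027.3761980 proceedings-article
for author in msg["author"]:                 # ordered author list
    print(author["given"], author["family"])
print("references:", msg["references-count"])  # 54
```

Note that "title" is a list (Crossref allows multiple titles), each "reference" entry may carry either a resolved DOI or only an "unstructured" citation string, and dates arrive as "date-parts" arrays rather than ISO strings, so downstream code should handle all three cases.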