{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T15:45:43Z","timestamp":1780501543013,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Aviation Science Foundation","award":["2022Z071078001"],"award-info":[{"award-number":["2022Z071078001"]}]},{"name":"Dreams Foundation of Jianghuai Advance Technology Center","award":["2023-ZM01Z001"],"award-info":[{"award-number":["2023-ZM01Z001"]}]},{"name":"Natural Science Foundation of China","award":["62276242, 62006209"],"award-info":[{"award-number":["62276242, 62006209"]}]},{"name":"Beijing Municipal Science & Technology Commission, Administrative Commission of Zhongguancun Science Park","award":["Z231100005923035"],"award-info":[{"award-number":["Z231100005923035"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3688985","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"11370-11376","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Building Robust Video-Level Deepfake Detection via Audio-Visual Local-Global Interactions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-8153-492X","authenticated-orcid":false,"given":"Yifan","family":"Wang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6244-0269","authenticated-orcid":false,"given":"Xuecheng","family":"Wu","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8922-1822","authenticated-orcid":false,"given":"Jia","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3891-9207","authenticated-orcid":false,"given":"Mohan","family":"Jing","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8974-3813","authenticated-orcid":false,"given":"Keda","family":"Lu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3197-8103","authenticated-orcid":false,"given":"Jun","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6787-4384","authenticated-orcid":false,"given":"Wen","family":"Su","sequence":"additional","affiliation":[{"name":"Zhejiang Sci-Tech University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1816-5420","authenticated-orcid":false,"given":"Fang","family":"Gao","sequence":"additional","affiliation":[{"name":"Guangxi University, Nanning, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1208-3031","authenticated-orcid":false,"given":"Qingsong","family":"Liu","sequence":"additional","affiliation":[{"name":"Unisound, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3598-8564","authenticated-orcid":false,"given":"Jianqing","family":"Sun","sequence":"additional","affiliation":[{"name":"Unisound, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8309-1301","authenticated-orcid":false,"given":"Jiaen","family":"Liang","sequence":"additional","affiliation":[{"name":"Unisound, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630761"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00338"},{"key":"e_1_3_2_1_3_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450 (2016)."},{"key":"e_1_3_2_1_4_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449--12460."},{"key":"e_1_3_2_1_5_1","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"2","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML, Vol. 2. 4.","journal-title":"ICML"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3689145"},{"key":"e_1_3_2_1_7_1","volume-title":"Munawar Hayat, Abhinav Dhall, and Kalin Stefanov.","author":"Cai Zhixi","year":"2023","unstructured":"Zhixi Cai, Shreya Ghosh, Aman Pankaj Adatia, Munawar Hayat, Abhinav Dhall, and Kalin Stefanov. 2023. AV-Deepfake1M: A large-scale LLM-driven audio-visual deepfake dataset. arXiv preprint arXiv:2311.15308 (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00150"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA56598.2022.10034605"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01815"},{"key":"e_1_3_2_1_11_1","volume-title":"An enhanced res2net with local and global feature fusion for speaker verification. arXiv preprint arXiv:2305.12838","author":"Chen Yafeng","year":"2023","unstructured":"Yafeng Chen, Siqi Zheng, Hui Wang, Luyao Cheng, Qian Chen, and Jiajun Qi. 2023. An enhanced res2net with local and global feature fusion for speaker verification. arXiv preprint arXiv:2305.12838 (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3625231","article-title":"Voice-face homogeneity tells deepfake","volume":"20","author":"Cheng Harry","year":"2023","unstructured":"Harry Cheng, Yangyang Guo, Tianyi Wang, Qi Li, Xiaojun Chang, and Liqiang Nie. 2023. Voice-face homogeneity tells deepfake. ACM Transactions on Multimedia Computing, Communications and Applications, Vol. 20, 3 (2023), 1--22.","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413700"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_15_1","volume-title":"Ecapa-tdnn: Emphasized channel attention, propagation and aggregation in tdnn based speaker verification. arXiv preprint arXiv:2005.07143","author":"Desplanques Brecht","year":"2020","unstructured":"Brecht Desplanques, Jenthe Thienpondt, and Kris Demuynck. 2020. Ecapa-tdnn: Emphasized channel attention, propagation and aggregation in tdnn based speaker verification. arXiv preprint arXiv:2005.07143 (2020)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00389"},{"key":"e_1_3_2_1_17_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-022-01083-2"},{"key":"e_1_3_2_1_19_1","volume-title":"Res2net: A new multi-scale backbone architecture","author":"Gao Shang-Hua","year":"2019","unstructured":"Shang-Hua Gao, Ming-Ming Cheng, Kai Zhao, Xin-Yu Zhang, Ming-Hsuan Yang, and Philip Torr. 2019. Res2net: A new multi-scale backbone architecture. IEEE transactions on pattern analysis and machine intelligence, Vol. 43, 2 (2019), 652--662."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00500"},{"key":"e_1_3_2_1_21_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451--3460."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00436"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19990"},{"key":"e_1_3_2_1_25_1","volume-title":"DF-TransFusion: Multimodal deepfake detection via lip-audio cross-attention and facial self-attention. arXiv preprint arXiv:2309.06511","author":"Kharel Aaditya","year":"2023","unstructured":"Aaditya Kharel, Manas Paranjape, and Aniket Bera. 2023. DF-TransFusion: Multimodal deepfake detection via lip-audio cross-attention and facial self-attention. arXiv preprint arXiv:2309.06511 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_27_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00013"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2021.101329"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0287503"},{"key":"e_1_3_2_1_31_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00009"},{"key":"e_1_3_2_1_33_1","first-page":"80","article-title":"Recurrent convolutional strategies for face manipulation detection in videos","volume":"3","author":"Sabir Ekraam","year":"2019","unstructured":"Ekraam Sabir, Jiaxin Cheng, Ayush Jaiswal, Wael AbdAlmageed, Iacopo Masi, and Prem Natarajan. 2019. Recurrent convolutional strategies for face manipulation detection in videos. Interfaces (GUI), Vol. 3, 1 (2019), 80--87.","journal-title":"Interfaces (GUI)"},{"key":"e_1_3_2_1_34_1","volume-title":"Musan: A music, speech, and noise corpus. arXiv preprint arXiv:1510.08484","author":"Snyder David","year":"2015","unstructured":"David Snyder, Guoguo Chen, and Daniel Povey. 2015. Musan: A music, speech, and noise corpus. arXiv preprint arXiv:1510.08484 (2015)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612365"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3595353.3595880"},{"key":"e_1_3_2_1_37_1","volume-title":"Cam: A fast and efficient network for speaker verification using context-aware masking. arXiv preprint arXiv:2303.00332","author":"Wang Hui","year":"2023","unstructured":"Hui Wang, Siqi Zheng, Yafeng Chen, Luyao Cheng, and Qian Chen. 2023. Cam: A fast and efficient network for speaker verification using context-aware masking. arXiv preprint arXiv:2303.00332 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"AVT2-DWF: Improving Deepfake Detection with Audio-Visual Fusion and Dynamic Weighting Strategies. arXiv preprint arXiv:2403.14974","author":"Wang Rui","year":"2024","unstructured":"Rui Wang, Dengpan Ye, Long Tang, Yunming Zhang, and Jiacheng Deng. 2024. AVT2-DWF: Improving Deepfake Detection with Audio-Visual Fusion and Dynamic Weighting Strategies. arXiv preprint arXiv:2403.14974 (2024)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00402"},{"key":"e_1_3_2_1_41_1","volume-title":"Explicit Correlation Learning for Generalizable Cross-Modal Deepfake Detection. arXiv preprint arXiv:2404.19171","author":"Yu Cai","year":"2024","unstructured":"Cai Yu, Shan Jia, Xiaomeng Fu, Jin Liu, Jiahe Tian, Jiao Dai, Xi Wang, Siwei Lyu, and Jizhong Han. 2024. Explicit Correlation Learning for Generalizable Cross-Modal Deepfake Detection. arXiv preprint arXiv:2404.19171 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01811"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00222"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5364"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01477"},{"key":"e_1_3_2_1_46_1","volume-title":"Eng Siong Chng, and Deepu Rajan","author":"Zou Heqing","year":"2024","unstructured":"Heqing Zou, Meng Shen, Yuchen Hu, Chen Chen, Eng Siong Chng, and Deepu Rajan. 2024. Cross-Modality and Within-Modality Regularization for Audio-Visual Deepfake Detection. In ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4900--4904."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688985","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3688985","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:29Z","timestamp":1750295849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688985"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":46,"alternative-id":["10.1145\/3664647.3688985","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3688985","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}