{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T02:02:30Z","timestamp":1780020150881,"version":"3.53.1"},"reference-count":45,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004608","name":"Jiangsu Province Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004608","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.knosys.2026.116214","type":"journal-article","created":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T15:30:25Z","timestamp":1778945425000},"page":"116214","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["AMID: Audio\u2013visual deepfake detection via adaptive multi-dimensional interaction modeling"],"prefix":"10.1016","volume":"346","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5319-8317","authenticated-orcid":false,"given":"Chenlong","family":"Xue","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0874-5033","authenticated-orcid":false,"given":"Kunyuan","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7435-3752","authenticated-orcid":false,"given":"Meng","family":"Sun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0015-8257","authenticated-orcid":false,"given":"Qiang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4890-0668","authenticated-orcid":false,"given":"Xiongwei","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7695-0690","authenticated-orcid":false,"given":"Kui","family":"Yao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.116214_b1","doi-asserted-by":"crossref","first-page":"131","DOI":"10.1016\/j.inffus.2020.06.014","article-title":"Deepfakes and beyond: A survey of face manipulation and fake detection","volume":"64","author":"Tolosana","year":"2020","journal-title":"Inf. Fusion"},{"issue":"1","key":"10.1016\/j.knosys.2026.116214_b2","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3425780","article-title":"The creation and detection of deepfakes: A survey","volume":"54","author":"Mirsky","year":"2021","journal-title":"ACM Comput. Surv."},{"issue":"5","key":"10.1016\/j.knosys.2026.116214_b3","doi-asserted-by":"crossref","first-page":"910","DOI":"10.1109\/JSTSP.2020.3002101","article-title":"Media forensics and DeepFakes: An overview","volume":"14","author":"Verdoliva","year":"2020","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"10.1016\/j.knosys.2026.116214_b4","series-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"27092","article-title":"AVFF: Audio-visual feature fusion for video deepfake detection","author":"Oorloff","year":"2024"},{"key":"10.1016\/j.knosys.2026.116214_b5","first-page":"612","article-title":"Multi-modal deepfake detection via multi-task audio-visual prompt learning","volume":"vol. 39","author":"Miao","year":"2025"},{"key":"10.1016\/j.knosys.2026.116214_b6","article-title":"DIP: Diffusion learning of inconsistency pattern for general deepfake detection","author":"Nie","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.knosys.2026.116214_b7","first-page":"122054","article-title":"On learning multi-modal forgery representation for diffusion generated video detection","volume":"37","author":"Song","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"3","key":"10.1016\/j.knosys.2026.116214_b8","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3625231","article-title":"Voice-face homogeneity tells deepfake","volume":"20","author":"Cheng","year":"2023","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.knosys.2026.116214_b9","doi-asserted-by":"crossref","unstructured":"D. Cozzolino, A. Pianese, M. Nie\u00dfner, L. Verdoliva, Audio-visual person-of-interest deepfake detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 943\u2013952.","DOI":"10.1109\/CVPRW59228.2023.00101"},{"key":"10.1016\/j.knosys.2026.116214_b10","doi-asserted-by":"crossref","first-page":"2015","DOI":"10.1109\/TIFS.2023.3262148","article-title":"AVoiD-DF: Audio-visual joint learning for detecting deepfake","volume":"18","author":"Yang","year":"2023","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"key":"10.1016\/j.knosys.2026.116214_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2023.110124","article-title":"AVFakeNet: A unified end-to-end Dense Swin Transformer deep learning model for audio\u2013visual deepfakes detection","volume":"136","author":"Ilyas","year":"2023","journal-title":"Appl. Soft Comput."},{"key":"10.1016\/j.knosys.2026.116214_b12","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4900","article-title":"Cross-modality and within-modality regularization for audio-visual deepfake detection","author":"Zou","year":"2024"},{"key":"10.1016\/j.knosys.2026.116214_b13","unstructured":"H. Khalid, S. Tariq, M. Kim, S.S. Woo, FakeAVCeleb: A novel audio-video multimodal deepfake dataset, in: Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks, 2021."},{"key":"10.1016\/j.knosys.2026.116214_b14","doi-asserted-by":"crossref","unstructured":"Z. Cai, K. Stefanov, A. Dhall, M. Hayat, Do you really mean that? Content driven audio-visual deepfake dataset and multimodal method for temporal forgery localization, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2022, pp. 2107\u20132116.","DOI":"10.1109\/DICTA56598.2022.10034605"},{"key":"10.1016\/j.knosys.2026.116214_b15","unstructured":"B. Dolhansky, J. Bitton, B. Pflaum, J. Lu, R. Howes, M. Wang, C.C. Ferrer, The deepfake detection challenge (DFDC) dataset, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, 2020, pp. 1\u201310."},{"key":"10.1016\/j.knosys.2026.116214_b16","doi-asserted-by":"crossref","unstructured":"A. Rossler, D. Cozzolino, L. Verdoliva, C. Riess, J. Thies, M. Nie\u00dfner, Faceforensics++: Learning to detect manipulated facial images, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 1\u201311.","DOI":"10.1109\/ICCV.2019.00009"},{"key":"10.1016\/j.knosys.2026.116214_b17","doi-asserted-by":"crossref","unstructured":"H. Zhao, W. Zhou, D. Chen, T. Wei, W. Zhang, N. Yu, Multi-attentional deepfake detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 2185\u20132194.","DOI":"10.1109\/CVPR46437.2021.00222"},{"key":"10.1016\/j.knosys.2026.116214_b18","doi-asserted-by":"crossref","unstructured":"Z. Wang, J. Bao, W. Zhou, W. Wang, H. Hu, H. Chen, H. Li, Dire for diffusion-generated image detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 22445\u201322455.","DOI":"10.1109\/ICCV51070.2023.02051"},{"key":"10.1016\/j.knosys.2026.116214_b19","series-title":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6369","article-title":"End-to-end anti-spoofing with RawNet2","author":"Tak","year":"2021"},{"key":"10.1016\/j.knosys.2026.116214_b20","series-title":"Does audio deepfake detection generalize?","author":"M\u00fcller","year":"2022"},{"key":"10.1016\/j.knosys.2026.116214_b21","doi-asserted-by":"crossref","unstructured":"T. Mittal, U. Bhattacharya, R. Chandra, A. Bera, D. Manocha, Emotions don\u2019t lie: An audio-visual deepfake detection method using affective cues, in: Proceedings of the 28th ACM International Conference on Multimedia, 2020, pp. 2823\u20132832.","DOI":"10.1145\/3394171.3413570"},{"key":"10.1016\/j.knosys.2026.116214_b22","doi-asserted-by":"crossref","unstructured":"Z. Liu, J. Ning, Y. Cao, Y. Wei, Z. Zhang, S. Lin, H. Hu, Video swin transformer, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 3202\u20133211.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"10.1016\/j.knosys.2026.116214_b23","series-title":"2021 IEEE\/CVF International Conference on Computer Vision","first-page":"14780","article-title":"Joint audio-visual deepfake detection","author":"Zhou","year":"2021"},{"key":"10.1016\/j.knosys.2026.116214_b24","doi-asserted-by":"crossref","unstructured":"A. Haliassos, R. Mira, S. Petridis, M. Pantic, Leveraging real talking faces via self-supervision for robust forgery detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 14950\u201314962.","DOI":"10.1109\/CVPR52688.2022.01453"},{"key":"10.1016\/j.knosys.2026.116214_b25","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops","first-page":"993","article-title":"Multimodaltrace: Deepfake detection using audiovisual representation learning","author":"Anas Raza","year":"2023"},{"key":"10.1016\/j.knosys.2026.116214_b26","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.knosys.2026.116214_b27","doi-asserted-by":"crossref","unstructured":"R. Girdhar, A. El-Nouby, Z. Liu, M. Singh, K.V. Alwala, A. Joulin, I. Misra, Imagebind: One embedding space to bind them all, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 15180\u201315190.","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"10.1016\/j.knosys.2026.116214_b28","series-title":"International Conference on Learning Representations","article-title":"Self-supervised learning from a multi-view perspective","author":"Tsai","year":"2021"},{"key":"10.1016\/j.knosys.2026.116214_b29","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2023.111149","article-title":"Co-space representation interaction network for multimodal sentiment analysis","volume":"283","author":"Shi","year":"2024","journal-title":"Knowl.-Based Syst."},{"issue":"2","key":"10.1016\/j.knosys.2026.116214_b30","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022","journal-title":"ICLR"},{"key":"10.1016\/j.knosys.2026.116214_b31","article-title":"GCS-Net: A universal AI-generated visual content detection method based on CLIP","author":"Xu","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116214_b32","series-title":"Learning audio-visual speech representation by masked multimodal cluster prediction","author":"Shi","year":"2022"},{"key":"10.1016\/j.knosys.2026.116214_b33","series-title":"LRS3-TED: a large-scale dataset for visual speech recognition","author":"Afouras","year":"2018"},{"key":"10.1016\/j.knosys.2026.116214_b34","doi-asserted-by":"crossref","unstructured":"J. Thies, M. Zollhofer, M. Stamminger, C. Theobalt, M. Nie\u00dfner, Face2face: Real-time face capture and reenactment of rgb videos, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 2387\u20132395.","DOI":"10.1109\/CVPR.2016.262"},{"issue":"4","key":"10.1016\/j.knosys.2026.116214_b35","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3306346.3323035","article-title":"Deferred neural rendering: Image synthesis using neural textures","volume":"38","author":"Thies","year":"2019","journal-title":"Acm Trans. Graph. (TOG)"},{"key":"10.1016\/j.knosys.2026.116214_b36","doi-asserted-by":"crossref","unstructured":"I. Korshunova, W. Shi, J. Dambre, L. Theis, Fast face-swap using convolutional neural networks, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 3677\u20133685.","DOI":"10.1109\/ICCV.2017.397"},{"key":"10.1016\/j.knosys.2026.116214_b37","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","volume":"vol. 31","author":"Jia","year":"2018"},{"key":"10.1016\/j.knosys.2026.116214_b38","doi-asserted-by":"crossref","unstructured":"K. Prajwal, R. Mukhopadhyay, V.P. Namboodiri, C. Jawahar, A lip sync expert is all you need for speech to lip generation in the wild, in: Proceedings of the 28th ACM International Conference on Multimedia, 2020, pp. 484\u2013492.","DOI":"10.1145\/3394171.3413532"},{"key":"10.1016\/j.knosys.2026.116214_b39","doi-asserted-by":"crossref","unstructured":"E. Zakharov, A. Shysheya, E. Burkov, V. Lempitsky, Few-shot adversarial learning of realistic neural talking head models, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 9459\u20139468.","DOI":"10.1109\/ICCV.2019.00955"},{"key":"10.1016\/j.knosys.2026.116214_b40","doi-asserted-by":"crossref","unstructured":"Y. Nirkin, Y. Keller, T. Hassner, FSGAN: Subject agnostic face swapping and reenactment, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 7184\u20137193.","DOI":"10.1109\/ICCV.2019.00728"},{"key":"10.1016\/j.knosys.2026.116214_b41","doi-asserted-by":"crossref","unstructured":"T. Karras, S. Laine, T. Aila, A style-based generator architecture for generative adversarial networks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 4401\u20134410.","DOI":"10.1109\/CVPR.2019.00453"},{"key":"10.1016\/j.knosys.2026.116214_b42","doi-asserted-by":"crossref","unstructured":"J. Wang, Y. Liu, Y. Hu, H. Shi, T. Mei, Facex-zoo: A pytorch toolbox for face recognition, in: Proceedings of the 29th ACM International Conference on Multimedia, 2021, pp. 3779\u20133782.","DOI":"10.1145\/3474085.3478324"},{"key":"10.1016\/j.knosys.2026.116214_b43","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"Nov","key":"10.1016\/j.knosys.2026.116214_b44","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"Maaten","year":"2008","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.knosys.2026.116214_b45","series-title":"2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference","first-page":"1524","article-title":"Multimodal forgery detection using ensemble learning","author":"Hashmi","year":"2022"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126009408?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126009408?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T01:14:10Z","timestamp":1780017250000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126009408"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":45,"alternative-id":["S0950705126009408"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116214","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"AMID: Audio\u2013visual deepfake detection via adaptive multi-dimensional interaction modeling","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116214","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116214"}}