{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:47Z","timestamp":1765339787932,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3761982","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"13707-13713","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["KLASSify to Verify: Audio-Visual Deepfake Detection Using SSL-based Audio and Handcrafted Visual Features"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4052-2754","authenticated-orcid":false,"given":"Ivan","family":"Kukanov","sequence":"first","affiliation":[{"name":"KLASS Engineering and Solutions, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3299-8113","authenticated-orcid":false,"given":"Jun Wah","family":"Ng","sequence":"additional","affiliation":[{"name":"KLASS Engineering and Solutions, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"volume-title":"Advances in Neural Information Processing Systems","author":"Baevski Alexei","key":"e_1_3_2_1_1_1","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. Wav2Vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Advances in Neural Information Processing Systems. H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.) Vol. 33. Curran Associates, Inc., 12449--12460."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680795"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103818"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103818"},{"key":"e_1_3_2_1_5_1","volume-title":"Usman Tariq, Tom Gedeon, and Abhinav Dhall.","author":"Cai Zhixi","year":"2025","unstructured":"Zhixi Cai, Kartik Kuckreja, Shreya Ghosh, Akanksha Chuchra, Muhammad Haris Khan, Usman Tariq, Tom Gedeon, and Abhinav Dhall. 2025. AV-Deepfake1M: A Large-Scale Audio-Visual Deepfake Benchmark with Real-World Perturbations. (2025). arXiv: 2507.20579."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Zhixi Cai Kalin Stefanov Abhinav Dhall and Munawar Hayat. 2022. Do You Really Mean That? Content Driven Audio-Visual Deepfake Dataset and Multimodal Method for Temporal Forgery Localization. In 2022 International Conference on Digital Image Computing: Techniques and Applications (DICTA) 1--10.","DOI":"10.1109\/DICTA56598.2022.10034605"},{"key":"e_1_3_2_1_7_1","volume-title":"Eren G\u00f6lge, and Moacir Antonelli Ponti.","author":"Casanova Edresson","year":"2023","unstructured":"Edresson Casanova, JulianWeber, Christopher Shulby, Arnaldo Candido Junior, Eren G\u00f6lge, and Moacir Antonelli Ponti. 2023. YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for everyone. (2023). arXiv: 2112.02418 [cs.SD]."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Edresson Casanova et al. 2024. XTTS: a Massively Multilingual Zero-Shot Text-to-Speech Model. (2024). arXiv: 2406.04904 [eess.AS].","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"e_1_3_2_1_9_1","unstructured":"Sanyuan Chen et al. 2021. WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing. CoRR abs\/2110.13900."},{"key":"e_1_3_2_1_10_1","unstructured":"Sanyuan Chen et al. 2021. WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing. CoRR abs\/2110.13900."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_1_12_1","unstructured":"Soumyya Kanti Datta Shan Jia and Siwei Lyu. 2024. Exposing Lip-syncing Deepfakes from Mouth Inconsistencies. (2024). arXiv: 2401.10113 [cs.CV]."},{"key":"e_1_3_2_1_13_1","unstructured":"Zhihao Gu Yang Chen Taiping Yao Shouhong Ding Jilin Li Feiyue Huang and Lizhuang Ma. 2021. Spatiotemporal Inconsistency Learning for DeepFake Video Detection. (2021). arXiv: 2109.01860 [cs.CV]."},{"key":"e_1_3_2_1_14_1","volume-title":"Proc., 29 (Oct.","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. IEEE\/ACM Trans. Audio, Speech and Lang. Proc., 29 (Oct. 2021), 3451--3460."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747766"},{"key":"e_1_3_2_1_16_1","unstructured":"Jaehyeon Kim Jungil Kong and Juhee Son. 2021. Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. (2021). arXiv: 2106.06103 [cs.SD]."},{"key":"e_1_3_2_1_17_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2017","unstructured":"Diederik P. Kingma and Jimmy Ba. 2017. Adam: A Method for Stochastic Optimization. (2017). arXiv: 1412.6980."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832350"},{"key":"e_1_3_2_1_20_1","volume-title":"STC Antispoofing Systems for the ASVspoof2019 Challenge. (2019). arXiv","author":"Lavrentyeva Galina","year":"1904","unstructured":"Galina Lavrentyeva, Sergey Novoselov, Andzhukaev Tseren, Marina Volkova, Artem Gorlanov, and Alexandr Kozlov. 2019. STC Antispoofing Systems for the ASVspoof2019 Challenge. (2019). arXiv: 1904.05576 [cs.SD]."},{"key":"e_1_3_2_1_21_1","volume-title":"Austin Reiter, and Gregory D. Hager","author":"Lea Colin","year":"2016","unstructured":"Colin Lea, Rene Vidal, Austin Reiter, and Gregory D. Hager. 2016. Temporal Convolutional Networks: A Unified Approach to Action Segmentation. (2016). arXiv: 1608.08242 [cs.CV]."},{"key":"e_1_3_2_1_22_1","unstructured":"Chunyu Li Chao Zhang Weikai Xu Jingyu Lin Jinghui Xie Weiguo Feng Bingyue Peng Cunjian Chen and Weiwei Xing. 2025. LatentSync: Taming Audio-Conditioned Latent Diffusion Models for Lip Sync with SyncNet Supervision. (2025). arXiv: 2412.09262 [cs.CV]."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1155\/int\/7945646"},{"key":"e_1_3_2_1_24_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. (2019). arXiv: 1711.05101 [cs.LG]."},{"key":"e_1_3_2_1_25_1","volume-title":"SGDR: Stochastic Gradient Descent with Warm Restarts.","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. SGDR: Stochastic Gradient Descent with Warm Restarts. (2017). arXiv: 1608.03983 [cs.LG]."},{"key":"e_1_3_2_1_26_1","volume-title":"Ming Guang Thornton, and Yannis Papakonstantinou","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McCormick, Alexander Roff, Jonathon Shlens, Michael Mazzocchi, Ming Guang Thornton, and Yannis Papakonstantinou. 2019. MediaPipe: A Framework for Building Perception Pipelines. (2019). arXiv: 1906.08172 [cs.CV]."},{"key":"e_1_3_2_1_27_1","volume-title":"Ravi Teja Gadde, and Abhinav Shrivastava","author":"Mukhopadhyay Soumik","year":"2023","unstructured":"Soumik Mukhopadhyay, Saksham Suri, Ravi Teja Gadde, and Abhinav Shrivastava. 2023. Diff2Lip: Audio Conditioned Diffusion Models for Lip-Synchronization. (2023). arXiv: 2308.09716 [cs.CV]."},{"key":"e_1_3_2_1_28_1","volume-title":"Does Audio Deepfake Detection Generalize?","author":"M\u00fcller Nicolas M.","year":"2024","unstructured":"Nicolas M. M\u00fcller, Pavel Czempin, Franziska Dieckmann, Adam Froghyar, and Konstantin B\u00f6ttinger. 2024. Does Audio Deepfake Detection Generalize? (2024). arXiv: 2203.16263."},{"volume-title":"Advances in Large Margin Classifiers","author":"Platt John C.","key":"e_1_3_2_1_29_1","unstructured":"John C. Platt. [n.d.] Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods. In Advances in Large Margin Classifiers. MIT Press, 61--74."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.3115\/1596374.1596399"},{"key":"e_1_3_2_1_31_1","unstructured":"Hemlata Tak Massimiliano Todisco et al. 2022. Automatic Speaker Verification Spoofing and Deepfake Detection Using Wav2vec 2.0 and Data Augmentation. In Odyssey."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i4.25658"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Massimiliano Todisco et al. 2019. ASVspoof 2019: Future Horizons in Spoofed and Fake Audio Detection. In Interspeech.","DOI":"10.21437\/Interspeech.2019-2249"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Jiadong Wang Xinyuan Qian Malu Zhang Robby T. Tan and Haizhou Li. 2023. Seeing What You Said: Talking Face Generation Guided by a Lip Reading Expert. (2023). arXiv: 2303.17480 [cs.CV].","DOI":"10.1109\/CVPR52729.2023.01408"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Xin Wang et al. 2024. ASVspoof 5: Crowdsourced Speech Data Deepfakes and Adversarial Attacks at Scale. ArXiv abs\/2408.08739.","DOI":"10.21437\/ASVspoof.2024-1"},{"key":"e_1_3_2_1_36_1","volume-title":"X2Face: A network for controlling face generation by using images, audio, and pose codes. (2018). arXiv","author":"Wiles Olivia","year":"1807","unstructured":"Olivia Wiles, A. Sophia Koepke, and Andrew Zisserman. 2018. X2Face: A network for controlling face generation by using images, audio, and pose codes. (2018). arXiv: 1807.10550 [cs.CV]."},{"key":"e_1_3_2_1_37_1","unstructured":"Xinqi Xiong Prakrut Patel Qingyuan Fan Amisha Wadhwa Sarathy Selvam Xiao Guo Luchao Qi Xiaoming Liu and Roni Sengupta. 2025. TalkingHead-Bench: A Multi-Modal Benchmark & Analysis of Talking-Head DeepFake Detection. (2025). arXiv: 2505.24866 [cs.CV]."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.21437\/ASVSPOOF.2021-8"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3233236"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.21437\/ASVSPOOF.2021-2"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021--738"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613767"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-942"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Yinglin Zheng Jianmin Bao Dong Chen Ming Zeng and Fang Wen. 2021. Exploring Temporal Coherence for More General Video Face Forgery Detection. (2021). arXiv: 2108.06693 [cs.CV].","DOI":"10.1109\/ICCV48922.2021.01477"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Jiafeng Zhong Bin Li and Jiangyan Yi. 2024. Enhancing Partially Spoofed Audio Localization with Boundary-aware Attention Mechanism. arXiv: 2407.21611.","DOI":"10.21437\/Interspeech.2024-587"},{"volume-title":"Semi-Supervised Learning Literature Survey. Tech. rep. 1530. Computer Sciences","author":"Zhu Xiaojin","key":"e_1_3_2_1_46_1","unstructured":"Xiaojin Zhu. 2005. Semi-Supervised Learning Literature Survey. Tech. rep. 1530. Computer Sciences, University of Wisconsin-Madison."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3761982","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:06:30Z","timestamp":1765339590000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3761982"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":46,"alternative-id":["10.1145\/3746027.3761982","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3761982","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}