{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:12:23Z","timestamp":1778080343798,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680622","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"3887-3896","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Identity-Driven Multimedia Forgery Detection via Reference Assistance"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2811-1992","authenticated-orcid":false,"given":"Junhao","family":"Xu","sequence":"first","affiliation":[{"name":"Shanghai Key Lab of Intell. Info. Processing, School of CS, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1737-3420","authenticated-orcid":false,"given":"Jingjing","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Key Lab of Intell. Info. Processing, School of CS, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0324-277X","authenticated-orcid":false,"given":"Xue","family":"Song","sequence":"additional","affiliation":[{"name":"Shanghai Key Lab of Intell. Info. Processing, School of CS, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5633-0224","authenticated-orcid":false,"given":"Feng","family":"Han","sequence":"additional","affiliation":[{"name":"Shanghai Key Lab of Intell. Info. Processing, School of CS, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5693-6084","authenticated-orcid":false,"given":"Haijun","family":"Shan","sequence":"additional","affiliation":[{"name":"CEC GienTech Technology Co.,Ltd., Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1907-8567","authenticated-orcid":false,"given":"Yu-Gang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Shanghai Key Lab of Intell. Info. Processing, School of CS, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630761"},{"key":"e_1_3_2_1_2_1","unstructured":"babysor. 2022. MockingBird. https:\/\/github.com\/babysor\/MockingBird."},{"key":"e_1_3_2_1_3_1","unstructured":"James Betker. 2022. TorToiSe text-to-speech. https:\/\/github.com\/neonbjb\/tortoise-tts."},{"key":"e_1_3_2_1_4_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6--12, 2020, virtual, Hugo Larochelle, Marc'Aurelio Ranzato, Raia Hadsell, Maria-Florina Balcan, and Hsuan-Tien Lin (Eds.). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA56598.2022.10034605"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_7_1","volume-title":"SimSwap: An Efficient Framework For High Fidelity Face Swapping. In MM '20: The 28th ACM International Conference on Multimedia.","author":"Chen Renwang","year":"2020","unstructured":"Renwang Chen, Xuanhong Chen, Bingbing Ni, and Yanhao Ge. 2020. SimSwap: An Efficient Framework For High Fidelity Face Swapping. In MM '20: The 28th ACM International Conference on Multimedia."},{"key":"e_1_3_2_1_8_1","volume-title":"Voice-face homogeneity tells deepfake. arXiv preprint arXiv:2203.02195","author":"Cheng Harry","year":"2022","unstructured":"Harry Cheng, Yangyang Guo, Tianyi Wang, Qi Li, Xiaojun Chang, and Liqiang Nie. 2022. Voice-face homogeneity tells deepfake. arXiv preprint arXiv:2203.02195 (2022)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.21437\/INTERSPEECH.2018--1929"},{"key":"e_1_3_2_1_11_1","unstructured":"CorentinJ. 2020. Real-Time-Voice-Cloning. https:\/\/github.com\/CorentinJ\/Real-Time-Voice-Cloning."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00101"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01483"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_15_1","unstructured":"B. Dolhansky R. Howes B. Pflaum N. Baram and C. Ferrer. 2019. The Deepfake Detection Challenge (DFDC) Preview Dataset. In arXiv:1910.08854."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00389"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00925"},{"key":"e_1_3_2_1_18_1","volume-title":"Identity-Driven DeepFake Detection. CoRR","author":"Dong Xiaoyi","year":"2020","unstructured":"Xiaoyi Dong, Jianmin Bao, Dongdong Chen, Weiming Zhang, Nenghai Yu, Dong Chen, Fang Wen, and Baining Guo. 2020. Identity-Driven DeepFake Detection. CoRR, Vol. abs\/2012.03930 (2020). showeprint[arXiv]2012.03930 https:\/\/arxiv.org\/abs\/2012.03930"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00341"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.21437\/INTERSPEECH.2021--698"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19955"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01453"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2405.12970"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00434"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/102"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00436"},{"key":"e_1_3_2_1_28_1","unstructured":"Deep Insight. 2023. Insightface. https:\/\/github.com\/deepinsight\/insightface."},{"key":"e_1_3_2_1_29_1","volume-title":"Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018","author":"Jia Ye","year":"2018","unstructured":"Ye Jia, Yu Zhang, Ron J. Weiss, Quan Wang, Jonathan Shen, Fei Ren, Zhifeng Chen, Patrick Nguyen, Ruoming Pang, Ignacio Lopez-Moreno, and Yonghui Wu. 2018. Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3--8, 2018, Montr\u00e9al, Canada, Samy Bengio, Hanna M. Wallach, Hugo Larochelle, Kristen Grauman, Nicol\u00f2 Cesa-Bianchi, and Roman Garnett (Eds.). 4485--4495. https:\/\/proceedings.neurips.cc\/paper\/2018\/hash\/6832a7b24bc06775d02b7406880b93fc-Abstract.html"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00296"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3476099.3484315"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2021","author":"Khalid Hasam","year":"2021","unstructured":"Hasam Khalid, Shahroz Tariq, Minha Kim, and Simon S. Woo. 2021. FakeAVCeleb: A Novel Audio-Video Multimodal Deepfake Dataset. In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2021, December 2021, virtual, Joaquin Vanschoren and Sai-Kit Yeung (Eds.). https:\/\/datasets-benchmarks-proceedings.neurips.cc\/paper\/2021\/hash\/d9d4f495e875a2e075a1a4a6e1b9770f-Abstract-round2.html"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01223"},{"key":"e_1_3_2_1_34_1","unstructured":"P. Korshunov and S. Marcel. 2018. DeepFakes: a New Threat to Face Recognition? Assessment and Detection. In arXiv:1812.08685."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630787"},{"key":"e_1_3_2_1_36_1","volume-title":"Exposing DeepFake Videos By Detecting Face Warping Artifacts. In IEEE Conference on Computer Vision and Pattern Recognition Workshops, CVPR Workshops 2019","author":"Li Yuezun","year":"2019","unstructured":"Yuezun Li and Siwei Lyu. 2019. Exposing DeepFake Videos By Detecting Face Warping Artifacts. In IEEE Conference on Computer Vision and Pattern Recognition Workshops, CVPR Workshops 2019, Long Beach, CA, USA, June 16--20, 2019. Computer Vision Foundation \/ IEEE, 46--52. http:\/\/openaccess.thecvf.com\/content_CVPRW_2019\/html\/Media_Forensics\/Li_Exposing_DeepFake_Videos_By_Detecting_Face_Warping_Artifacts_CVPRW_2019_paper.html"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00327"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00083"},{"key":"e_1_3_2_1_39_1","unstructured":"MarekKowalski. 2019. Faceswap-GAN. https:\/\/github.com\/shaoanlu\/faceswap-GAN."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-030--58571--6_39"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00939"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02345"},{"key":"e_1_3_2_1_44_1","volume-title":"Luis RP, Jian Jiang, et al.","author":"Perov Ivan","year":"2020","unstructured":"Ivan Perov, Daiheng Gao, Nikolay Chervoniy, Kunlin Liu, Sugasa Marangonda, Chris Um\u00e9, Mr Dpfks, Carl Shift Facenheim, Luis RP, Jian Jiang, et al. 2020. DeepFaceLab: Integrated, flexible and extensible face-swapping framework. arXiv preprint arXiv:2005.05535 (2020)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-030--58610--2_6"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2212.04356"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"A. Rossler D. Cozzolino L. Verdoliva C. Riess J. Thies and M. Niessner. 2019. FaceForensics: Learning to Detect Manipulated Facial Images. In ICCV.","DOI":"10.1109\/ICCV.2019.00009"},{"key":"e_1_3_2_1_50_1","volume-title":"RVC: Retrieval-based Voice Conversion WebUI. https:\/\/github.com\/RVC-Project\/Retrieval-based-Voice-Conversion-WebUI.","year":"2023","unstructured":"RVC-Project. 2023. RVC: Retrieval-based Voice Conversion WebUI. https:\/\/github.com\/RVC-Project\/Retrieval-based-Voice-Conversion-WebUI."},{"key":"e_1_3_2_1_51_1","unstructured":"s0md3v. 2023. Roop. https:\/\/github.com\/s0md3v\/roop."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00667"},{"key":"e_1_3_2_1_53_1","unstructured":"shaoanlu. 2019. FaceSwap. https:\/\/github.com\/MarekKowalski\/FaceSwap."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612386"},{"key":"e_1_3_2_1_55_1","unstructured":"Suno. 2023. Bark. https:\/\/github.com\/suno-ai\/bark."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414234"},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning, ICML 2019, 9--15","volume":"6114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc V. Le. 2019. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. In Proceedings of the 36th International Conference on Machine Learning, ICML 2019, 9--15 June 2019, Long Beach, California, USA (Proceedings of Machine Learning Research, Vol. 97), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, 6105--6114. http:\/\/proceedings.mlr.press\/v97\/tan19a.html"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2203.12602"},{"key":"e_1_3_2_1_59_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02048"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"X. Yang Y. Li and S. Lyu. 2019. Exposing Deep Fakes Using Inconsistent Head Poses. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8683164"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683164"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/178"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746061"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01477"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01453"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413769"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680622","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680622","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680622"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":66,"alternative-id":["10.1145\/3664647.3680622","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680622","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}