{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T16:34:35Z","timestamp":1781886875589,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":96,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,20]],"date-time":"2021-10-20T00:00:00Z","timestamp":1634688000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Research Foundation of Korea (NRF) grant funded by Korea government MSIT Basic Science Research Program","award":["2020R1C1C1006004"],"award-info":[{"award-number":["2020R1C1C1006004"]}]},{"name":"IITP grant funded by the Korea government (MSIT) AI Graduate School Support Program","award":["2019-0-00421"],"award-info":[{"award-number":["2019-0-00421"]}]},{"name":"MSIT (Ministry of Science ICT) Korea under the High-Potential Individuals Global Training Program","award":["2020-0-01550"],"award-info":[{"award-number":["2020-0-01550"]}]},{"name":"IITP grant funded by the Korea government MSITOriginal Technology Development of Artificial Intelligence Industry","award":["2021-0-00017"],"award-info":[{"award-number":["2021-0-00017"]}]},{"name":"IITP grant funded by the Korea government (MSIT) Regional strategic industry convergence security core talent training business","award":["2019-0-01343"],"award-info":[{"award-number":["2019-0-01343"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,24]]},"DOI":"10.1145\/3476099.3484315","type":"proceedings-article","created":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T01:55:27Z","timestamp":1634435727000},"page":"7-15","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":95,"title":["Evaluation of an Audio-Video Multimodal Deepfake Dataset using Unimodal and Multimodal Detectors"],"prefix":"10.1145","author":[{"given":"Hasam","family":"Khalid","sequence":"first","affiliation":[{"name":"Sungkyunkwan University, Suwon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minha","family":"Kim","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Suwon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shahroz","family":"Tariq","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Suwon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Simon S.","family":"Woo","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Suwon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,10,20]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630761"},{"key":"e_1_3_2_1_2_1","volume-title":"Protecting World Leaders Against Deep Fakes. In CVPR Workshops. CVPRW","author":"Agarwal Shruti","year":"2019"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00152"},{"key":"e_1_3_2_1_4_1","unstructured":"APTLY lab. 2020. The Fake-or-Real Dataset. http:\/\/bil.eecs.yorku.ca\/datasets\/ [Online; accessed 28-July-2021].  APTLY lab. 2020. The Fake-or-Real Dataset. http:\/\/bil.eecs.yorku.ca\/datasets\/ [Online; accessed 28-July-2021]."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.5555\/3327546.3327667"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2895466"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1162\/NECO_a_00801"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413700"},{"key":"e_1_3_2_1_10_1","volume-title":"Voxceleb2: Deep speaker recognition. arXiv preprint arXiv:1806.05622","author":"Chung Joon Son","year":"2018"},{"key":"e_1_3_2_1_11_1","volume-title":"Forensictransfer: Weakly-supervised domain adaptation for forgery detection. arXiv preprint arXiv:1812.02510","author":"Cozzolino Davide","year":"2018"},{"key":"e_1_3_2_1_12_1","volume-title":"Predictions of subjective ratings and spoofing assessments of voice conversion challenge 2020 submissions. arXiv preprint arXiv:2009.03554","author":"Das Rohan Kumar","year":"2020"},{"key":"e_1_3_2_1_13_1","unstructured":"David Griner. 2020. State Farm and Kenny Mayne Brilliantly Faked Us All Out During The Last Dance. https:\/\/www.adweek.com\/brand-marketing\/state-farm-and-kenny-mayne-brilliantly-faked-us-all-out-during-the-last-dance\/ [Online; accessed 31-May-2021].  David Griner. 2020. State Farm and Kenny Mayne Brilliantly Faked Us All Out During The Last Dance. https:\/\/www.adweek.com\/brand-marketing\/state-farm-and-kenny-mayne-brilliantly-faked-us-all-out-during-the-last-dance\/ [Online; accessed 31-May-2021]."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2477042"},{"key":"e_1_3_2_1_15_1","volume-title":"The deepfake detection challenge dataset. arXiv preprint arXiv:2006.07397","author":"Dolhansky Brian","year":"2020"},{"key":"e_1_3_2_1_16_1","volume-title":"Contributing data to deepfake detection research. Google AI Blog","author":"Dufour Nick","year":"2019"},{"key":"e_1_3_2_1_17_1","volume-title":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, 494--509","author":"Epstein Baruch","year":"2018"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS.2018.8639163"},{"key":"e_1_3_2_1_20_1","volume-title":"et almbox","author":"Hasan Md Rashidul","year":"2004"},{"key":"e_1_3_2_1_21_1","volume-title":"2020 b. T-GD: Transferable GAN-generated Images Detection Framework. arXiv preprint arXiv:2008.04115","author":"Jeon Hyeonseong","year":"2020"},{"key":"e_1_3_2_1_22_1","volume-title":"2020 a. FDFtNet: Facing Off Fake Images using Fake Detection Fine-tuning Network. arXiv preprint arXiv:2001.01265","author":"Jeon Hyeonseong","year":"2020"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/3327345.3327360"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00296"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5220\/0004828606710678"},{"key":"e_1_3_2_1_26_1","volume-title":"Woo","author":"Khalid Hasam","year":"2021"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00336"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475535"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00111"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Tomi Kinnunen Md Sahidullah H\u00e9ctor Delgado Massimiliano Todisco Nicholas Evans Junichi Yamagishi and Kong Aik Lee. 2017. The ASVspoof 2017 challenge: Assessing the limits of replay spoofing attack detection. (2017).  Tomi Kinnunen Md Sahidullah H\u00e9ctor Delgado Massimiliano Todisco Nicholas Evans Junichi Yamagishi and Kong Aik Lee. 2017. The ASVspoof 2017 challenge: Assessing the limits of replay spoofing attack detection. (2017).","DOI":"10.21437\/Interspeech.2017-1111"},{"key":"e_1_3_2_1_31_1","first-page":"84","article-title":"Review of existing text-to-speech algorithms","volume":"8","author":"Kireev Nikita","year":"2020","journal-title":"International Journal of Open Information Technologies"},{"key":"e_1_3_2_1_32_1","volume-title":"Deepfakes: a new threat to face recognition? assessment and detection. arXiv preprint arXiv:1812.08685","author":"Korshunov Pavel","year":"2018"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.397"},{"key":"e_1_3_2_1_34_1","volume-title":"KoDF: A Large-scale Korean DeepFake Detection Dataset. arXiv preprint arXiv:2103.10094","author":"Kwon Patrick","year":"2021"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-78120-0_23"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2021.107256"},{"key":"e_1_3_2_1_37_1","volume-title":"Faceshifter: Towards high fidelity and occlusion aware face swapping. arXiv preprint arXiv:1912.13457","author":"Li Lingzhi","year":"2019"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630787"},{"key":"e_1_3_2_1_39_1","unstructured":"Yuezun Li Ming-Ching Chang and Siwei Lyu. 2018b. In ictu oculi: Exposing ai generated fake face videos by detecting eye blinking. arXiv preprint arXiv:1806.02877 (2018).  Yuezun Li Ming-Ching Chang and Siwei Lyu. 2018b. In ictu oculi: Exposing ai generated fake face videos by detecting eye blinking. arXiv preprint arXiv:1806.02877 (2018)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00327"},{"key":"e_1_3_2_1_41_1","volume-title":"The AMI meeting corpus. In Proceedings of the 5th international conference on methods and techniques in behavioral research","volume":"88","author":"McCowan Iain","year":"2005"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCNC.2019.8685502"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397482.3450726"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341699"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2512530.2512532"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413570"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101096"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2482819"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2461544"},{"key":"e_1_3_2_1_50_1","volume-title":"Use of a capsule network to detect fake images and videos. arXiv preprint arXiv:1910.12467","author":"Nguyen Huy H","year":"2019"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00728"},{"key":"e_1_3_2_1_52_1","volume-title":"Fast Fourier Transform and Convolution Algorithms","author":"Nussbaumer Henri J"},{"key":"e_1_3_2_1_53_1","volume-title":"Multimodal fusion with deep neural networks for audio-video emotion recognition. arXiv preprint arXiv:1907.03196","author":"Ortega Juan DS","year":"2019"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2889273"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Tanvina B Patel and Hemant A Patil. 2015. Combining evidences from mel cepstral cochlear filter cepstral and instantaneous frequency features for detection of natural vs. spoofed speech. In Sixteenth annual conference of the international speech communication association .  Tanvina B Patel and Hemant A Patil. 2015. Combining evidences from mel cepstral cochlear filter cepstral and instantaneous frequency features for detection of natural vs. spoofed speech. In Sixteenth annual conference of the international speech communication association .","DOI":"10.21437\/Interspeech.2015-467"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3449978"},{"key":"e_1_3_2_1_58_1","volume-title":"https:\/\/www.forbes.com\/sites\/robtoews\/2020\/05\/25\/deepfakes-are-going-to-wreak-havoc-on-society-we-are-not-prepared\/'sh=7885d8737494 [Online","author":"Toews Rob","year":"2021"},{"key":"e_1_3_2_1_59_1","unstructured":"Andreas Rossler Davide Cozzolino Luisa Verdoliva Christian Riess Justus Thies and Matthias Nie\u00dfner. 2019. Faceforensics  Andreas Rossler Davide Cozzolino Luisa Verdoliva Christian Riess Justus Thies and Matthias Nie\u00dfner. 2019. Faceforensics"},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 1--11"},{"key":"e_1_3_2_1_62_1","volume-title":"Interfaces (GUI)","volume":"3","author":"Sabir Ekraam","year":"2019"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Md Sahidullah Tomi Kinnunen and Cemal Hanilcc i. 2015. A comparison of features for synthetic speech detection. (2015).  Md Sahidullah Tomi Kinnunen and Cemal Hanilcc i. 2015. A comparison of features for synthetic speech detection. (2015).","DOI":"10.21437\/Interspeech.2015-472"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2015.2398812"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-01793-3_21"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1162\/089976601750264965"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.3390\/jimaging7070108"},{"key":"e_1_3_2_1_68_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014"},{"key":"e_1_3_2_1_69_1","volume-title":"Deepfakes are the most dangerous crime of the future. https:\/\/www.independent.co.uk\/life-style\/gadgets-and-tech\/news\/deepfakes-dangerous-crime-artificial-intelligence-a9655821.html [Online","author":"Smith Adam","year":"2021"},{"key":"e_1_3_2_1_70_1","volume-title":"Engineering & Applications","author":"Sridevi M","year":"2012"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_72_1","volume-title":"International Conference on Machine Learning. PMLR, 6105--6114","author":"Tan Mingxing","year":"2019"},{"key":"e_1_3_2_1_73_1","volume-title":"Am I a Real or Fake Celebrity? Measuring Commercial Face Recognition Web APIs under Deepfake Impersonation Attack. arXiv preprint arXiv:2103.00847","author":"Tariq Shahroz","year":"2021"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267357.3267367"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297280.3297410"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3449809"},{"key":"e_1_3_2_1_77_1","volume-title":"A Convolutional LSTM based Residual Network for Deepfake Video Detection. arXiv preprint arXiv:2009.07480","author":"Tariq Shahroz","year":"2020"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2016-41"},{"key":"e_1_3_2_1_80_1","volume-title":"ASVspoof 2019: Future horizons in spoofed and fake audio detection. arXiv preprint arXiv:1904.05441","author":"Todisco Massimiliano","year":"2019"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2020.06.014"},{"key":"e_1_3_2_1_82_1","volume-title":"Multimodal for movie genre prediction. https:\/\/github.com\/dh1105\/Multi-modal-movie-genre-prediction [Online","author":"Verma Dhruv","year":"2021"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911996.2912064"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-473"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101114"},{"key":"e_1_3_2_1_86_1","volume-title":"et almbox","author":"Wang Yuxuan","year":"2017"},{"key":"e_1_3_2_1_87_1","volume-title":"The Free Encyclopedia. https:\/\/en.wikipedia.org\/w\/index.php?title=Hann_function&oldid=1001711522 . [Online","author":"Wikipedia","year":"2021"},{"key":"e_1_3_2_1_88_1","volume-title":"Deepfake Video Detection Using Convolutional Vision Transformer. arXiv preprint arXiv:2102.11126","author":"Wodajo Deressa","year":"2021"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-462"},{"key":"e_1_3_2_1_90_1","volume-title":"Multimodal classification. https:\/\/github.com\/xkaple01\/multimodal-classification [Online","year":"2021"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.32604\/cmc.2021.016760"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683164"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00534"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2016.2647199"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.229"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00116"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413769"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 1st Workshop on Synthetic Multimedia - Audiovisual Deepfake Generation and Detection"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3476099.3484315","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3476099.3484315","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:12:02Z","timestamp":1750191122000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3476099.3484315"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,20]]},"references-count":96,"alternative-id":["10.1145\/3476099.3484315","10.1145\/3476099"],"URL":"https:\/\/doi.org\/10.1145\/3476099.3484315","relation":{},"subject":[],"published":{"date-parts":[[2021,10,20]]},"assertion":[{"value":"2021-10-20","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}