{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,18]],"date-time":"2026-04-18T08:57:53Z","timestamp":1776502673775,"version":"3.51.2"},"reference-count":75,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001869","name":"Academia Sinica","doi-asserted-by":"publisher","award":["AS-GC-111-M01"],"award-info":[{"award-number":["AS-GC-111-M01"]}],"id":[{"id":"10.13039\/501100001869","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020950","name":"National Council of Science and Technology, Taiwan","doi-asserted-by":"publisher","award":["NSTC 113-2634-F-002-003"],"award-info":[{"award-number":["NSTC 113-2634-F-002-003"]}],"id":[{"id":"10.13039\/501100020950","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020950","name":"National Council of Science and Technology, Taiwan","doi-asserted-by":"publisher","award":["NSTC 112-2221-E-001-009-MY3"],"award-info":[{"award-number":["NSTC 112-2221-E-001-009-MY3"]}],"id":[{"id":"10.13039\/501100020950","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Cogn. Dev. Syst."],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1109\/tcds.2025.3554477","type":"journal-article","created":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T15:00:40Z","timestamp":1742828440000},"page":"1360-1376","source":"Crossref","is-referenced-by-count":9,"title":["AVTENet: A Human-Cognition-Inspired Audio-Visual Transformer-Based Ensemble Network for Video Deepfake Detection"],"prefix":"10.1109","volume":"17","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1973-6902","authenticated-orcid":false,"given":"Ammarah","family":"Hashmi","sequence":"first","affiliation":[{"name":"Social Networks and Human-Centered Computing Program, Taiwan International Graduate Program, Academia Sinica, Taipei, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5591-8423","authenticated-orcid":false,"given":"Sahibzada Adil","family":"Shahzad","sequence":"additional","affiliation":[{"name":"Social Networks and Human-Centered Computing Program, Taiwan International Graduate Program, Academia Sinica, Taipei, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9097-2318","authenticated-orcid":false,"given":"Chia Wen","family":"Lin","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering and the Institute of Communications Engineering, National Tsing Hua University, Hsinchu, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6956-0418","authenticated-orcid":false,"given":"Yu","family":"Tsao","sequence":"additional","affiliation":[{"name":"Research Center for Information Technology Innovation, Academia Sinica, Taipei, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3599-5071","authenticated-orcid":false,"given":"Hsin-Min","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Science, Academia Sinica, Taipei, Taiwan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.53637\/DELS2700"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"ref3","article-title":"Auto-encoding variational Bayes","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma","year":"2014"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3425780"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1515\/revneuro-2022-0065"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.54097\/gew7ng02"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLC48188.2019.8949228"},{"key":"ref8","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPAASC55919.2022.9980255"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPAASC55919.2022.9980296"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630761"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"ref13","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tan","year":"2019"},{"key":"ref14","first-page":"19","article-title":"Two-stream neural networks for tampered face detection","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit. Workshops","author":"Han","year":"2017"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413700"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.32985\/ijeces.13.9.9"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"ref19","article-title":"Learning audio-visual speech representation by masked multimodal cluster prediction","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Shi","year":"2021"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2023.3327081"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2024.3383952"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2024.3357618"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2021.3071170"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2022.3179427"},{"key":"ref25","first-page":"1","article-title":"FakeAVCeleb: A novel audio-video multimodal deepfake dataset","volume-title":"Proc. Neural Inf. Process. Syst. Track Datasets Benchmarks","author":"Khalid","year":"2021"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.262"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00955"},{"key":"ref28","article-title":"First order motion model for image animation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Siarohin","year":"2019"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201283"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00603"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2836316"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00112"},{"key":"ref34","first-page":"46","article-title":"Exposing deepfake videos by detecting face warping artifacts","volume-title":"Proc. IEEE Conf. Comput. Vision Pattern Recognit. Workshops","author":"Li","year":"2019"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683164"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-06788-4_52"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00284"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630787"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2023.104771"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW.2019.00020"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2024.3441821"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS49906.2020.9360904"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475332"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3549555.3549588"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2022.3158613"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1177\/09567976221121348"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413570"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01453"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00101"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/URTC60662.2023.10534969"},{"key":"ref51","article-title":"A novel deepfake detection framework using audio-video-textual features","author":"Asha","year":"2022","journal-title":"Res. Square"},{"key":"ref52","article-title":"Self-supervised transformer for deepfake detection","author":"Zhao","year":"2022"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095247"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2021.3064679"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2021.3086011"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2021.3062624"},{"key":"ref57","article-title":"The deepfake detection challenge (DFDC) dataset","author":"Dolhansky","year":"2020"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.397"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00728"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"ref62","first-page":"4485","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jia","year":"2018"},{"key":"ref63","article-title":"kinetics human action video dataset","author":"Kay","year":"2017"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00500"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053841"},{"key":"ref66","article-title":"LRS3-TED: A large-scale dataset for visual speech recognition","author":"Afouras","year":"2018"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3476099.3484315"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00106"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2023.110124"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2023.3262148"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/icmla58977.2023.00207"},{"key":"ref73","article-title":"Deepfake video detection using convolutional vision transformer","author":"Wodajo","year":"2021"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP51287.2024.10648224"},{"key":"ref75","article-title":"DeepFakes: A new threat to face recognition? Assessment and detection","author":"Korshunov","year":"2018"}],"container-title":["IEEE Transactions on Cognitive and Developmental Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/7274989\/11288412\/10938399.pdf?arnumber=10938399","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,16]],"date-time":"2025-12-16T18:33:10Z","timestamp":1765909990000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10938399\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12]]},"references-count":75,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tcds.2025.3554477","relation":{},"ISSN":["2379-8920","2379-8939"],"issn-type":[{"value":"2379-8920","type":"print"},{"value":"2379-8939","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12]]}}}