{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:45:34Z","timestamp":1768322734466,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":77,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Key Research and Development Program of Shaanxi","award":["2021ZDLGY01-03"],"award-info":[{"award-number":["2021ZDLGY01-03"]}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020AAA0140000"],"award-info":[{"award-number":["2020AAA0140000"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["ZDRC2102"],"award-info":[{"award-number":["ZDRC2102"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Joint Fund of Ministry of Education of China","award":["8091B022149"],"award-info":[{"award-number":["8091B022149"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613842","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"8707-8718","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Spatio-Temporal Catcher: A Self-Supervised Transformer for Deepfake Video Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9569-7311","authenticated-orcid":false,"given":"Maosen","family":"Li","sequence":"first","affiliation":[{"name":"Xidian University &amp; Alibaba Group, Xi'an &amp; Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9671-8978","authenticated-orcid":false,"given":"Xurong","family":"Li","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3078-0867","authenticated-orcid":false,"given":"Kun","family":"Yu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2620-3247","authenticated-orcid":false,"given":"Cheng","family":"Deng","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3483-8333","authenticated-orcid":false,"given":"Heng","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Maryland, College Park, College Park, MD, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6171-3168","authenticated-orcid":false,"given":"Feng","family":"Mao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2093-2839","authenticated-orcid":false,"given":"Hui","family":"Xue","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1548-0329","authenticated-orcid":false,"given":"Minghao","family":"Li","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. Contributing data to deepfake detection research. https:\/\/ai.googleblog. com\/2019\/09\/contributing-data-to-deepfake-detection.html. Accessed: 2021-11-13."},{"key":"e_1_3_2_1_2_1","unstructured":"[n. d.]. FaceSwap. https:\/\/github.com\/MarekKowalski\/FaceSwap. [Accessed: 2020-11-12]."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630761"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.299"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2909827.2930786"},{"key":"e_1_3_2_1_7_1","volume-title":"International conference on machine learning","volume":"2","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In International conference on machine learning, Vol. 2. 4."},{"key":"e_1_3_2_1_8_1","volume-title":"The Deepfake Detection Challenge (DFDC) Preview Dataset. arXiv","author":"Nicole Baram Cristian Canton Ben Pflaum","year":"1910","unstructured":"Ben Pflaum Nicole Baram Cristian Canton Ferrer Brian Dolhansky, Russ Howes. 2019. The Deepfake Detection Challenge (DFDC) Preview Dataset. arXiv 1910.08854 (2019)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.3390\/info11020125"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00408"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58574-7_7"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01815"},{"key":"e_1_3_2_1_13_1","unstructured":"Liang Chen Yong Zhang Yibing Song Jue Wang and Lingqiao Liu. 2022. OST: Improving Generalization of DeepFake Detection via One-Shot Test-Time Training. In Advances in Neural Information Processing Systems Alice H. Oh Alekh Agarwal Danielle Belgrave and Kyunghyun Cho (Eds.). https:\/\/openreview.net\/forum?id=YPoRoad6gzY"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2024156.2024164"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00582"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00421"},{"key":"e_1_3_2_1_17_1","volume-title":"Retinaface: Single-shot multi-level face localisation in the wild. In Pro- ceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 5203--5212.","author":"Deng Jiankang","year":"2020","unstructured":"Jiankang Deng, Jia Guo, Evangelos Ververas, Irene Kotsia, and Stefanos Zafeiriou. 2020. Retinaface: Single-shot multi-level face localisation in the wild. In Pro- ceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 5203--5212."},{"key":"e_1_3_2_1_18_1","volume-title":"The Deepfake Detection Challenge (DFDC) Preview Dataset. CoRR abs\/1910.08854","author":"Dolhansky Brian","year":"2019","unstructured":"Brian Dolhansky, Russ Howes, Ben Pflaum, Nicole Baram, and Cristian Canton-Ferrer. 2019. The Deepfake Detection Challenge (DFDC) Preview Dataset. CoRR abs\/1910.08854 (2019)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00389"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00925"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00791"},{"key":"e_1_3_2_1_22_1","unstructured":"Quanfu Fan Rameswar Panda et al. 2021. An Image Classifier Can Suffice For Video Understanding. arXiv preprint arXiv:2106.14104 (2021)."},{"key":"e_1_3_2_1_23_1","volume-title":"Masked Autoencoders As Spatiotemporal Learners. arXiv preprint arXiv:2205.09113","author":"Feichtenhofer Christoph","year":"2022","unstructured":"Christoph Feichtenhofer, Haoqi Fan, Yanghao Li, and Kaiming He. 2022. Masked Autoencoders As Spatiotemporal Learners. arXiv preprint arXiv:2205.09113 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Sharpness-aware minimization for efficiently improving generalization. arXiv preprint arXiv:2010.01412","author":"Foret Pierre","year":"2020","unstructured":"Pierre Foret, Ariel Kleiner, Hossein Mobahi, and Behnam Neyshabur. 2020. Sharpness-aware minimization for efficiently improving generalization. arXiv preprint arXiv:2010.01412 (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"International conference on machine learning. PMLR, 3247--3258","author":"Frank Joel","year":"2020","unstructured":"Joel Frank, Thorsten Eisenhofer, Lea Sch\u00f6nherr, Asja Fischer, Dorothea Kolossa, and Thorsten Holz. 2020. Leveraging frequency analysis for deep fake image recognition. In International conference on machine learning. PMLR, 3247--3258."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.537"},{"key":"e_1_3_2_1_27_1","volume-title":"Vdub: Modifying face video of actors for plausible visual alignment to a dubbed audio track. In Computer graphics forum","author":"Garrido Pablo","year":"2015","unstructured":"Pablo Garrido, Levi Valgaerts, Hamid Sarmadi, Ingmar Steiner, Kiran Varanasi, Patrick Perez, and Christian Theobalt. 2015. Vdub: Modifying face video of actors for plausible visual alignment to a dubbed audio track. In Computer graphics forum, Vol. 34. Wiley Online Library, 193--204."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536426"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00308"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475508"},{"key":"e_1_3_2_1_32_1","unstructured":"Jiazhi Guan Hang Zhou Zhibin Hong Errui Ding Jingdong Wang Cheng-bin Quan and Youjian Zhao. 2022. Delving into Sequential Patches for Deep-fake Detection. In Advances in Neural Information Processing Systems Alice H. Oh Alekh Agarwal Danielle Belgrave and Kyunghyun Cho (Eds.). https: \/\/openreview.net\/forum?id=osPA8Bs4MJB"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01453"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00500"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_36_1","volume-title":"Attgan: Facial attribute editing by only changing what you want","author":"He Zhenliang","year":"2019","unstructured":"Zhenliang He, Wangmeng Zuo, Meina Kan, Shiguang Shan, and Xilin Chen. 2019. Attgan: Facial attribute editing by only changing what you want. IEEE transactions on image processing 28, 11 (2019), 5464--5478."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00296"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Tero Karras Samuli Laine and Timo Aila. 2019. A style-based generator architecture for generative adversarial networks. In CVPR. 4401--4410.","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475332"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.23919\/BIOSIG.2018.8553251"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.5555\/1577069.1755843"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.397"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00639"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547832"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00512"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00505"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414034"},{"key":"e_1_3_2_1_48_1","volume-title":"Exposing deepfake videos by detecting face warping artifacts. arXiv preprint arXiv:1811.00656","author":"Li Yuezun","year":"2018","unstructured":"Yuezun Li and Siwei Lyu. 2018. Exposing deepfake videos by detecting face warping artifacts. arXiv preprint arXiv:1811.00656 (2018)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00327"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00083"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_52_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2018","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01605"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58571-6_39"},{"key":"e_1_3_2_1_55_1","volume-title":"Computer Graphics Forum","author":"Naruniec Jacek","unstructured":"Jacek Naruniec, Leonhard Helminger, Christopher Schroers, and Romann M Weber. 2020. High-resolution neural face swapping for visual effects. In Computer Graphics Forum, Vol. 39. Wiley Online Library, 173--184."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230744.3230818"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00024"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_6"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00009"},{"key":"e_1_3_2_1_60_1","first-page":"80","article-title":"Recurrent convolutional strategies for face manipulation detection in videos","volume":"3","author":"Sabir Ekraam","year":"2019","unstructured":"Ekraam Sabir, Jiaxin Cheng, Ayush Jaiswal, Wael AbdAlmageed, Iacopo Masi, and Prem Natarajan. 2019. Recurrent convolutional strategies for face manipulation detection in videos. Interfaces (GUI) 3, 1 (2019), 80--87.","journal-title":"Interfaces (GUI)"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01816"},{"key":"e_1_3_2_1_62_1","volume-title":"Tel Aviv","author":"Song Luchuan","year":"2022","unstructured":"Luchuan Song, Zheng Fang, Xiaodan Li, Xiaoyi Dong, Zhenchao Jin, Yuefeng Chen, and Siwei Lyu. 2022. Adaptive Face Forgery Detection in Cross Domain. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXIV. Springer, 467--484."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547806"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20130"},{"key":"e_1_3_2_1_65_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. arXiv preprint arXiv:2203.12602","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. arXiv preprint arXiv:2203.12602 (2022)."},{"key":"e_1_3_2_1_66_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01468"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.08.026"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3172845"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00753"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-31456-9_15"},{"key":"e_1_3_2_1_72_1","volume-title":"Time Is MattEr: Temporal Self-supervision for Video Transformers. In International Conference on Machine Learning. PMLR, 25804--25816","author":"Yun Sukmin","year":"2022","unstructured":"Sukmin Yun, Jaehyung Kim, Dongyoon Han, Hwanjun Song, Jung-Woo Ha, and Jinwoo Shin. 2022. Time Is MattEr: Temporal Self-supervision for Video Transformers. In International Conference on Machine Learning. PMLR, 25804--25816."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547913"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_26"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00222"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01475"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01477"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613842","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613842","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:54:04Z","timestamp":1755820444000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613842"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":77,"alternative-id":["10.1145\/3581783.3613842","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613842","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}