{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:09:34Z","timestamp":1750219774670,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Deng Feng Fund"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592284","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"85-94","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Dual-Modality Co-Learning for Unveiling Deepfake in Spatio-Temporal Space"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5219-1097","authenticated-orcid":false,"given":"Jiazhi","family":"Guan","sequence":"first","affiliation":[{"name":"BNRist, DCST, Tsinghua University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2616-923X","authenticated-orcid":false,"given":"Hang","family":"Zhou","sequence":"additional","affiliation":[{"name":"VIS, Baidu Inc., China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0059-3810","authenticated-orcid":false,"given":"Zhizhi","family":"Guo","sequence":"additional","affiliation":[{"name":"China Telecom, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9881-0673","authenticated-orcid":false,"given":"Tianshu","family":"Hu","sequence":"additional","affiliation":[{"name":"VIS, Baidu Inc., China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8840-8876","authenticated-orcid":false,"given":"Lirui","family":"Deng","sequence":"additional","affiliation":[{"name":"BNRist, DCST, Tsinghua University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5450-6567","authenticated-orcid":false,"given":"Chengbin","family":"Quan","sequence":"additional","affiliation":[{"name":"BNRist, DCST, Tsinghua University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6745-286X","authenticated-orcid":false,"given":"Meng","family":"Fang","sequence":"additional","affiliation":[{"name":"University of Liverpool, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9841-1796","authenticated-orcid":false,"given":"Youjian","family":"Zhao","sequence":"additional","affiliation":[{"name":"BNRist, DCST, Tsinghua University, China and Zhongguancun Laboratory, China"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00109"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00338"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00152"},{"key":"e_1_3_2_1_4_1","volume-title":"A short note on the kinetics-700 human action dataset. arXiv preprint arXiv:1907.06987","author":"Carreira Joao","year":"2019","unstructured":"Joao Carreira, Eric Noland, Chloe Hillier, and Andrew Zisserman. 2019. A short note on the kinetics-700 human action dataset. arXiv preprint arXiv:1907.06987 (2019)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58574-7_7"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16193"},{"volume-title":"Leveraging edges and optical flow on faces for deepfake detection. In 2020 IEEE international joint conference on biometrics (IJCB)","author":"Chintha Akash","key":"e_1_3_2_1_8_1","unstructured":"Akash Chintha, Aishwarya Rao, Saniat Sohrawardi, Kartavya Bhatt, Matthew Wright, and Raymond Ptucha. 2020. Leveraging edges and optical flow on faces for deepfake detection. In 2020 IEEE international joint conference on biometrics (IJCB). IEEE, 1\u201310."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00421"},{"key":"e_1_3_2_1_11_1","unstructured":"DeepFaceLab. 2022. deepfacelab. https:\/\/github.com\/iperov\/DeepFaceLab. Accessed: 2022-07-02."},{"key":"e_1_3_2_1_12_1","unstructured":"Deepfakes. 2022. Deepfakes github. https:\/\/github.com\/deepfakes\/faceswap. Accessed: 2022-07-02."},{"key":"e_1_3_2_1_13_1","volume-title":"The deepfake detection challenge dataset. arXiv e-prints","author":"Dolhansky Brian","year":"2020","unstructured":"Brian Dolhansky, Joanna Bitton, Ben Pflaum, Jikuo Lu, Russ Howes, Menglin Wang, and Cristian Canton\u00a0Ferrer. 2020. The deepfake detection challenge dataset. arXiv e-prints (2020), arXiv\u20132006."},{"key":"e_1_3_2_1_14_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_15_1","unstructured":"FaceApp. 2021. faceapp.com. https:\/\/www.faceapp.com\/. Accessed: 2022-07-02."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_17_1","volume-title":"Spatio-temporal Features for Generalized Detection of Deepfake Videos. arXiv preprint arXiv:2010.11844","author":"Ganiyusufoglu Ipek","year":"2020","unstructured":"Ipek Ganiyusufoglu, L\u00a0Minh Ng\u00f4, Nedko Savov, Sezer Karaoglu, and Theo Gevers. 2020. Spatio-temporal Features for Generalized Detection of Deepfake Videos. arXiv preprint arXiv:2010.11844 (2020)."},{"key":"e_1_3_2_1_18_1","volume-title":"Fighting deepfakes by detecting GAN DCT anomalies. arXiv preprint arXiv:2101.09781","author":"Giudice Oliver","year":"2021","unstructured":"Oliver Giudice, Luca Guarnera, and Sebastiano Battiato. 2021. Fighting deepfakes by detecting GAN DCT anomalies. arXiv preprint arXiv:2101.09781 (2021)."},{"key":"e_1_3_2_1_19_1","volume-title":"Generative adversarial networks. arXiv preprint arXiv:1406.2661","author":"Goodfellow J","year":"2014","unstructured":"Ian\u00a0J Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial networks. arXiv preprint arXiv:1406.2661 (2014)."},{"key":"e_1_3_2_1_20_1","volume-title":"Exploiting Fine-grained Face Forgery Clues via Progressive Enhancement Learning. arXiv preprint arXiv:2112.13977","author":"Gu Qiqi","year":"2021","unstructured":"Qiqi Gu, Shen Chen, Taiping Yao, Yang Chen, Shouhong Ding, and Ran Yi. 2021. Exploiting Fine-grained Face Forgery Clues via Progressive Enhancement Learning. arXiv preprint arXiv:2112.13977 (2021)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475508"},{"key":"e_1_3_2_1_22_1","unstructured":"Zhihao Gu Yang Chen Taiping Yao Shouhong Ding Jilin Li and Lizhuang Ma. 2022. Delving into the local: Dynamic inconsistency learning for deepfake video detection. AAAI."},{"key":"e_1_3_2_1_23_1","volume-title":"Detecting deepfake by creating spatio-temporal regularity disruption. arXiv preprint arXiv:2207.10402","author":"Guan Jiazhi","year":"2022","unstructured":"Jiazhi Guan, Hang Zhou, Mingming Gong, Youjian Zhao, Errui Ding, and Jingdong Wang. 2022. Detecting deepfake by creating spatio-temporal regularity disruption. arXiv preprint arXiv:2207.10402 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Delving into sequential patches for DeepFake detection. arXiv preprint arXiv:2207.02803","author":"Guan Jiazhi","year":"2022","unstructured":"Jiazhi Guan, Hang Zhou, Zhibin Hong, Errui Ding, Jingdong Wang, Chengbin Quan, and Youjian Zhao. 2022. Delving into sequential patches for DeepFake detection. arXiv preprint arXiv:2207.02803 (2022)."},{"key":"e_1_3_2_1_25_1","volume-title":"Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.). Vol.\u00a035. Curran Associates","author":"Guan Jiazhi","year":"2022","unstructured":"Jiazhi Guan, Hang Zhou, Zhibin Hong, Errui Ding, Jingdong Wang, Chengbin Quan, and Youjian Zhao. 2022. Delving into Sequential Patches for Deepfake Detection. In Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.). Vol.\u00a035. Curran Associates, Inc., 4517\u20134530. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/1d051fb631f104cb2a621451f37676b9-Paper-Conference.pdf"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS.2018.8639163"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00500"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.373"},{"key":"e_1_3_2_1_29_1","volume-title":"Determining optical flow. Artificial intelligence 17, 1-3","author":"Horn KP","year":"1981","unstructured":"Berthold\u00a0KP Horn and Brian\u00a0G Schunck. 1981. Determining optical flow. Artificial intelligence 17, 1-3 (1981), 185\u2013203."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3074259"},{"key":"e_1_3_2_1_31_1","unstructured":"Ziheng Hu Hongtao Xie Yuxin Wang Jiahong Li Zhongyuan Wang and Yongdong Zhang. 2021. Dynamic Inconsistency-aware DeepFake Video Detection. In IJCAI."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00296"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 7482\u20137491","author":"Kendall Alex","year":"2018","unstructured":"Alex Kendall, Yarin Gal, and Roberto Cipolla. 2018. Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. In Proceedings of the IEEE conference on computer vision and pattern recognition. 7482\u20137491."},{"key":"e_1_3_2_1_34_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma P","year":"2013","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_35_1","unstructured":"Marek Kowalski. 2022. FaceSwap github. https:\/\/github.com\/MarekKowalski\/FaceSwap. Accessed: 2022-07-02."},{"key":"e_1_3_2_1_36_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey\u00a0E Hinton. 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25 (2012), 1097\u20131105."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00639"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00512"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00505"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/WIFS.2018.8630787"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00327"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506381"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108832"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_6"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00009"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107950"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01816"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00361"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323035"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2929464.2929475"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_53_1","volume-title":"M2TR: Multi-modal Multi-scale Transformers for Deepfake Detection. arXiv preprint arXiv:2104.09770","author":"Wang Junke","year":"2021","unstructured":"Junke Wang, Zuxuan Wu, Jingjing Chen, and Yu-Gang Jiang. 2021. M2TR: Multi-modal Multi-scale Transformers for Deepfake Detection. arXiv preprint arXiv:2104.09770 (2021)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_20"},{"volume-title":"Joint pattern recognition symposium","author":"Zach Christopher","key":"e_1_3_2_1_55_1","unstructured":"Christopher Zach, Thomas Pock, and Horst Bischof. 2007. A duality based approach for realtime tv-l 1 optical flow. In Joint pattern recognition symposium. Springer, 214\u2013223."},{"key":"e_1_3_2_1_56_1","volume-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv preprint arXiv:1612.03928","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2016. Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv preprint arXiv:1612.03928 (2016)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Daichi Zhang Chenyu Li Fanzhao Lin Dan Zeng and Shiming Ge. 2021. Detecting deepfake videos with temporal dropout 3DCNN. IJCAI.","DOI":"10.24963\/ijcai.2021\/178"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"},{"key":"e_1_3_2_1_59_1","volume-title":"Multi-attentional Deepfake Detection. arXiv preprint arXiv:2103.02406","author":"Zhao Hanqing","year":"2021","unstructured":"Hanqing Zhao, Wenbo Zhou, Dongdong Chen, Tianyi Wei, Weiming Zhang, and Nenghai Yu. 2021. Multi-attentional Deepfake Detection. arXiv preprint arXiv:2103.02406 (2021)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01475"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01477"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00295"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413769"}],"event":{"name":"ICMR '23: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Thessaloniki Greece","acronym":"ICMR '23"},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592284","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592284","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:30Z","timestamp":1750178250000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592284"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":63,"alternative-id":["10.1145\/3591106.3592284","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592284","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}