{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,18]],"date-time":"2026-04-18T16:42:26Z","timestamp":1776530546729,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171326, 62371350, 62471343"],"award-info":[{"award-number":["62171326, 62371350, 62471343"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Science and Technology Research Project of Xinjiang Production and Construction Corps","award":["KST-01"],"award-info":[{"award-number":["KST-01"]}]},{"name":"Guangdong OPPO Mobile Telecommunications Corp.","award":["OPPO-WHU-02"],"award-info":[{"award-number":["OPPO-WHU-02"]}]},{"name":"Wuhan University Supercomputing Center","award":["WUSC-001"],"award-info":[{"award-number":["WUSC-001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755563","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"8547-8556","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Query-Based Audio-Visual Temporal Forgery Localization with Register-Enhanced Representation Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3342-4883","authenticated-orcid":false,"given":"Xiaodong","family":"Zhu","sequence":"first","affiliation":[{"name":"NERCMS, School of Computer Science, Wuhan University, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9156-5456","authenticated-orcid":false,"given":"Suting","family":"Wang","sequence":"additional","affiliation":[{"name":"NERCMS, School of Computer Science, Wuhan University, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6488-2961","authenticated-orcid":false,"given":"Junqi","family":"Yang","sequence":"additional","affiliation":[{"name":"NERCMS, School of Computer Science, Wuhan University, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3001-7957","authenticated-orcid":false,"given":"Yuhong","family":"Yang","sequence":"additional","affiliation":[{"name":"NERCMS, School of Computer Science, Wuhan University, Wuhan City, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6933-3298","authenticated-orcid":false,"given":"Weiping","family":"Tu","sequence":"additional","affiliation":[{"name":"NERCMS, School of Computer Science, Wuhan University, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9796-488X","authenticated-orcid":false,"given":"Zhongyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"NERCMS, School of Computer Science, Wuhan University, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Mesonet: A Compact Facial Video Forgery Detection Network. In 2018 IEEE International Workshop on Information Forensics and Security (WIFS). 1--7.","author":"Afchar Darius","year":"2018","unstructured":"Darius Afchar, Vincent Nozick, Junichi Yamagishi, and Isao Echizen. 2018. Mesonet: A Compact Facial Video Forgery Detection Network. In 2018 IEEE International Workshop on Information Forensics and Security (WIFS). 1--7."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889052"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Anurag Bagchi Jazib Mahmood Dolton Fernandes and Ravi Kiran Sarvadevabhatla. 2021. Hear Me Out: Fusional Approaches for Audio Augmented Temporal Action Localization. (2021). arXiv:2106.14118 [cs]","DOI":"10.5220\/0010832700003124"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_8"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680795"},{"key":"e_1_3_2_1_6_1","volume-title":"Glitch in the Matrix: A Large Scale Benchmark for Content Driven Audio-Visual Forgery Detection and Localization. Computer Vision and Image Understanding 236 (Nov","author":"Cai Zhixi","year":"2023","unstructured":"Zhixi Cai, Shreya Ghosh, Abhinav Dhall, Tom Gedeon, Kalin Stefanov, and Munawar Hayat. 2023. Glitch in the Matrix: A Large Scale Benchmark for Content Driven Audio-Visual Forgery Detection and Localization. Computer Vision and Image Understanding 236 (Nov. 2023), 103818."},{"key":"e_1_3_2_1_7_1","volume-title":"Content Driven Audio-Visual Deepfake Dataset and Multimodal Method for Temporal Forgery Localization. In 2022 International Conference on Digital Image Computing: Techniques and Applications","author":"Cai Zhixi","year":"2022","unstructured":"Zhixi Cai, Kalin Stefanov, Abhinav Dhall, and Munawar Hayat. 2022. Do You Really Mean That? Content Driven Audio-Visual Deepfake Dataset and Multimodal Method for Temporal Forgery Localization. In 2022 International Conference on Digital Image Computing: Techniques and Applications (Sydney, Australia) (DICTA). 1--10."},{"key":"e_1_3_2_1_8_1","volume-title":"Regression Analysis of Count Data","author":"Colin Cameron A.","unstructured":"A. Colin Cameron and Pravin Trivedi. 2013. Regression Analysis of Count Data. Vol. 53. Cambridge University Press."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_29"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3635717"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413700"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"e_1_3_2_1_14_1","volume-title":"Image Analysis and Processing -- ICIAP 2022 (Lecture Notes in Computer Science), Stan Sclaroff, Cosimo Distante, Marco Leo, Giovanni M","author":"Coccomini Davide Alessandro","unstructured":"Davide Alessandro Coccomini, Nicola Messina, Claudio Gennaro, and Fabrizio Falchi. 2022. Combining EfficientNet and Vision Transformers for Video Deepfake Detection. In Image Analysis and Processing -- ICIAP 2022 (Lecture Notes in Computer Science), Stan Sclaroff, Cosimo Distante, Marco Leo, Giovanni M. Farinella, and Federico Tombari (Eds.). Springer International Publishing, Cham, 219--229."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00101"},{"key":"e_1_3_2_1_16_1","volume-title":"Vision Transformers Need Registers. In International Conference on Learning Representations (ICLR).","author":"Darcet Timoth\u00e9e","year":"2024","unstructured":"Timoth\u00e9e Darcet, Maxime Oquab, Julien Mairal, and Piotr Bojanowski. 2024. Vision Transformers Need Registers. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01011"},{"key":"e_1_3_2_1_18_1","volume-title":"Contrastive Audio-Visual Masked Autoencoder. In International Conference on Learning Representations (ICLR).","author":"Gong Yuan","unstructured":"Yuan Gong, Andrew Rouditchenko, Alexander H. Liu, David Harwath, Leonid Karlinsky, Hilde Kuehne, and James R. Glass. 2023. Contrastive Audio-Visual Masked Autoencoder. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_19_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Haliassos Alexandros","year":"2023","unstructured":"Alexandros Haliassos, Pingchuan Ma, Rodrigo Mira, Stavros Petridis, and Maja Pantic. 2023. Jointly Learning Visual and Auditory Speech Representations from Raw Data. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_20_1","first-page":"20371","article-title":"MAViL: Masked Audio-Video Learners","volume":"36","author":"Huang Po-Yao","year":"2023","unstructured":"Po-Yao Huang, Vasu Sharma, Hu Xu, Chaitanya Ryali, Yanghao Li, Shang-Wen Li, Gargi Ghosh, Jitendra Malik, Christoph Feichtenhofer, et al. 2023. MAViL: Masked Audio-Video Learners. Advances in Neural Information Processing Systems 36 (2023), 20371--20393.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","volume-title":"MIS-AVoiDD: Modality Invariant and Specific Representation for Audio-Visual Deepfake Detection. In 2023 International Conference on Machine Learning and Applications","author":"Katamneni Vinaya Sree","year":"2023","unstructured":"Vinaya Sree Katamneni and Ajita Rattani. 2023. MIS-AVoiDD: Modality Invariant and Specific Representation for Audio-Visual Deepfake Detection. In 2023 International Conference on Machine Learning and Applications (Jacksonville, FL, USA) (ICMLA). 1371--1378."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 18837--18846","author":"Kim Ho-Joong","year":"2024","unstructured":"Ho-Joong Kim, Jung-Ho Hong, Heejo Kong, and Seong-Whan Lee. 2024. TETAD: Towards Full End-to-End Temporal Action Detection via Time-Aligned Coordinate Expression. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 18837--18846."},{"key":"e_1_3_2_1_23_1","volume-title":"EquiAV: Leveraging Equivariance for Audio-Visual Contrastive Learning. In International Conference on Machine Learning","author":"Kim Jongsuk","year":"2024","unstructured":"Jongsuk Kim, Hyeongkeun Lee, Kyeongha Rho, Junmo Kim, and Joon Son Chung. 2024. EquiAV: Leveraging Equivariance for Audio-Visual Contrastive Learning. In International Conference on Machine Learning (Vienna, Austria) (ICML). JMLR.org, 24327--24341."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00944"},{"key":"e_1_3_2_1_25_1","unstructured":"Christos Koutlis and Symeon Papadopoulos. 2024. DiMoDif: Discourse Modality- Information Differentiation for Audio-Visual Deepfake Detection and Localization. (2024). arXiv:2411.10193 [cs]"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00399"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_30_1","article-title":"End-to-End Temporal Action Detection with Transformer","author":"Liu Xiaolong","year":"2022","unstructured":"Xiaolong Liu, Qimeng Wang, Yao Hu, Xu Tang, Shiwei Zhang, Song Bai, and Xiang Bai. 2022. End-to-End Temporal Action Detection with Transformer. IEEE Transactions on Image Processing 31 (Aug. 2022), 5427--5441.","journal-title":"IEEE Transactions on Image Processing 31"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00043"},{"key":"e_1_3_2_1_32_1","volume-title":"Interspeech","author":"Ma Pingchuan","year":"2021","unstructured":"Pingchuan Ma, Rodrigo Mira, Stavros Petridis, Bj\u00f6rnW. Schuller, and Maja Pantic. 2021. LiRA: Learning Visual Speech Representations from Audio Through Self- Supervision. In Interspeech 2021. ISCA, 3011--3015."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413570"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02567"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"e_1_3_2_1_37_1","unstructured":"Megha Nawhal and Greg Mori. 2021. Activity Graph Transformer for Temporal Action Localization. (2021). arXiv:2101.08540 [cs]"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02559"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00944"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-205"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00055"},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 993--1000","author":"Raza Muhammad Anas","year":"2023","unstructured":"Muhammad Anas Raza and Khalid Mahmood Malik. 2023. Multimodaltrace: Deepfake Detection Using Audiovisual Representation Learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 993--1000."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i8.26162"},{"key":"e_1_3_2_1_45_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdelrahman Mohamed. 2022. Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01808"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Feng Wang Jiahao Wang Sucheng Ren Guoyizhe Wei Jieru Mei Wei Shao Yuyin Zhou Alan Yuille and Cihang Xie. 2024. Mamba-R: Vision Mamba ALSO Needs Registers. (2024). arXiv:2405.14858 [cs]","DOI":"10.1109\/CVPR52734.2025.01392"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"e_1_3_2_1_49_1","unstructured":"Luyu Wang Pauline Luc Adria Recasens Jean-Baptiste Alayrac and Aaron van den Oord. 2021. Multimodal Self-Supervised Learning of General Audio Representations. (2021). arXiv:2104.12807 [cs eess]"},{"key":"e_1_3_2_1_50_1","unstructured":"Yi Wang Kunchang Li Yizhuo Li Yinan He Bingkun Huang Zhiyu Zhao Hongjie Zhang Jilan Xu Yi Liu Zun Wang et al. 2022. InternVideo: General Video Foundation Models via Generative and Discriminative Learning. (2022). arXiv:2212.03191 [cs]"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2023.3262148"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3309899"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111432"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613767"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01340"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01247"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6999"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01453"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3275873"},{"key":"e_1_3_2_1_62_1","volume-title":"Cross-Modality and Within-Modality Regularization for Audio- Visual DeepFake Detection. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 4900--4904","author":"Zou Heqing","year":"2024","unstructured":"Heqing Zou, Meng Shen, Yuchen Hu, Chen Chen, Eng Siong Chng, and Deepu Rajan. 2024. Cross-Modality and Within-Modality Regularization for Audio- Visual DeepFake Detection. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 4900--4904."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755563","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:07:44Z","timestamp":1765310864000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755563"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":62,"alternative-id":["10.1145\/3746027.3755563","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755563","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}