{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:18Z","timestamp":1781538978891,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["22404034"],"award-info":[{"award-number":["22404034"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2023A1515012697"],"award-info":[{"award-number":["2023A1515012697"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810812","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"949-957","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CHM: Context Hiding and Misguidance for Robust Adversarial Attacks on Active Speaker Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0332-0701","authenticated-orcid":false,"given":"Xiangyu","family":"Ye","sequence":"first","affiliation":[{"name":"School of Computer Science and Cyber Engineering, Guangzhou University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1510-3443","authenticated-orcid":false,"given":"Yatie","family":"Xiao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Cyber Engineering, Guangzhou University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8193-1234","authenticated-orcid":false,"given":"Qingxiao","family":"Guan","sequence":"additional","affiliation":[{"name":"School of Computer Science and Cyber Engineering, Guangzhou University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5783-9243","authenticated-orcid":false,"given":"Zhenbang","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Cyber Engineering, Guangzhou University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"International Conference on Machine Learning (ICML)","author":"Athalye Anish","year":"2018","unstructured":"Anish Athalye, Logan Engstrom, Andrew Ilyas, and Kevin Kwok. 2018. Synthesizing Robust Adversarial Examples. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01719"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658101"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02573"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022646"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0007"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890414"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00957"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00202"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446169"},{"key":"e_1_3_3_1_12_2","first-page":"4283","volume-title":"Interspeech","author":"Huang Xinghao","year":"2024","unstructured":"Xinghao Huang, Weiwei Jiang, Long Rao, Wei Xu, and Wenqing Cheng. 2024. Active Speaker Detection in Fisheye Meeting Scenes with Scene Spatial Spectrums. In Interspeech. 4283\u20134287."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448124"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02196"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Junhua Liao Haihan Duan Kanghui Feng Wanbing Zhao Yanbing Yang Liangyin Chen and Yanru Chen. 2025. LR-ASD: Lightweight and Robust Network for Active Speaker Detection. International Journal of Computer Vision (IJCV) 133 7 (July 2025) 4749\u20134769.","DOI":"10.1007\/s11263-025-02399-2"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00393"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Jing Liu Sihan Chen Xingjian He Longteng Guo Xinxin Zhu Weining Wang and Jinhui Tang. 2025. VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset. IEEE Transactions on Pattern Analysis and Machine Intelligence 47 2 (2025) 708\u2013724.","DOI":"10.1109\/TPAMI.2024.3479776"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02487"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"e_1_3_3_1_20_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Madry Aleksander","year":"2018","unstructured":"Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu. 2018. Towards Deep Learning Models Resistant to Adversarial Attacks. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_21_2","series-title":"(IJCAI \u201924)","volume-title":"Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI)","author":"Mu Zhaoxi","year":"2024","unstructured":"Zhaoxi Mu and Xinyu Yang. 2024. Separate in the speech chain: cross-modal conditional audio-visual target speech extraction. In Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI)(IJCAI \u201924). Article 709, 9\u00a0pages."},{"key":"e_1_3_3_1_22_2","unstructured":"Le\u00a0Thien\u00a0Phuc Nguyen Zhuoran Yu Khoa Quang\u00a0Nhat Cao Yuwei Guo Tu\u00a0Ho\u00a0Manh Pham Tuan\u00a0Tai Nguyen Toan Ngo\u00a0Duc Vo Lucas Poon Soochahn Lee and Yong\u00a0Jae Lee. 2025. UniTalk: Towards Universal Active Speaker Detection in Real World Scenarios. arxiv:https:\/\/arXiv.org\/abs\/2505.21954\u00a0[cs.CV]"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096874"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3734426"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01019"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475587"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00555"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Jiafeng Wang Zhaoyu Chen Kaixun Jiang Dingkang Yang Lingyi Hong Yan Wang and Wenqiang Zhang. 2024. Boosting the Transferability of Adversarial Attacks with Global Momentum Initialization. Expert Systems with Applications 255 (2024) 124757.","DOI":"10.1016\/j.eswa.2024.124757"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00190"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01747"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00425"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01210"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Peng Xu Xiatian Zhu and David\u00a0A. Clifton. 2023. Multimodal Learning With Transformers: A Survey. IEEE Transactions on Pattern Analysis and Machine Intelligence 45 10 (2023) 12113\u201312132.","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Zhe Yang Wenrui Li and Guanghui Cheng. 2025. SHMamba: Structured Hyperbolic State Space Model for Audio-Visual Question Answering. IEEE Transactions on Audio Speech and Language Processing 33 (2025) 3582\u20133593.","DOI":"10.1109\/TASLPRO.2025.3597461"},{"key":"e_1_3_3_1_36_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Yang Zequn","year":"2024","unstructured":"Zequn Yang, Yake Wei, Ce Liang, and Di Hu. 2024. Quantifying and Enhancing Multi-modal Robustness with Modality Preference. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_37_2","volume-title":"International Conference on Machine Learning (ICML)","author":"Yao Wei","year":"2025","unstructured":"Wei Yao, Zeliang Zhang, Huayi Tang, and Yong Liu. 2025. Understanding Model Ensemble in Transferable Adversarial Attack. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_1_38_2","first-page":"1","volume-title":"European Conference on Computer Vision (ECCV)","author":"Zhang Lin","year":"2024","unstructured":"Lin Zhang, Shentong Mo, Yijing Zhang, and Pedro Morgado. 2024. Audio-Synchronized Visual Animation. In European Conference on Computer Vision (ECCV). 1\u201318."},{"key":"e_1_3_3_1_39_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Zhang Zeliang","year":"2025","unstructured":"Zeliang Zhang, Susan Liang, Daiki Shimada, and Chenliang Xu. 2025. Rethinking Audio-Visual Adversarial Vulnerability from Temporal and Modality Perspectives. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Xujian Zhao Yixin Wang and Peiquan Jin. 2025. Audio-Visual Adaptive Fusion Network for Question Answering Based on Contrastive Learning. Proceedings of the AAAI Conference on Artificial Intelligence 39 10 10483\u201310491.","DOI":"10.1609\/aaai.v39i10.33138"},{"key":"e_1_3_3_1_41_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Zhu Bin","year":"2024","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, WANG HongFa, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, Cai\u00a0Wan Zhang, Zhifeng Li, Wei Liu, and Li Yuan. 2024. LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment. In International Conference on Learning Representations (ICLR)."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:29:47Z","timestamp":1781537387000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810812"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":40,"alternative-id":["10.1145\/3805622.3810812","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810812","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}