{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:07:42Z","timestamp":1778080062774,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","funder":[{"name":"Start-up Package, School of EEECS, Queen's University Belfast","award":["D8203EEC"],"award-info":[{"award-number":["D8203EEC"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754500","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"2586-2595","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["EventVAD: Training-Free Event-Aware Video Anomaly Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0475-7142","authenticated-orcid":false,"given":"Yihua","family":"Shao","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2554-2020","authenticated-orcid":false,"given":"Haojin","family":"He","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5272-8657","authenticated-orcid":false,"given":"Sijie","family":"Li","sequence":"additional","affiliation":[{"name":"The University of Sheffield, Sheffield, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7212-6782","authenticated-orcid":false,"given":"Siyu","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5097-5430","authenticated-orcid":false,"given":"Xinwei","family":"Long","sequence":"additional","affiliation":[{"name":"Tsinghua University, 
Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5167-0094","authenticated-orcid":false,"given":"Fanhu","family":"Zeng","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7188-972X","authenticated-orcid":false,"given":"Yuxuan","family":"Fan","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3774-9417","authenticated-orcid":false,"given":"Muyang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nanjing University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7531-4057","authenticated-orcid":false,"given":"Ziyang","family":"Yan","sequence":"additional","affiliation":[{"name":"University of Trento, Trento, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1967-8020","authenticated-orcid":false,"given":"Ao","family":"Ma","sequence":"additional","affiliation":[{"name":"JD.com, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6058-3426","authenticated-orcid":false,"given":"Xiaochen","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2077-1246","authenticated-orcid":false,"given":"Hao","family":"Tang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5341-6040","authenticated-orcid":false,"given":"Yan","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5107-0338","authenticated-orcid":false,"given":"Shuyan","family":"Li","sequence":"additional","affiliation":[{"name":"Queen's University Belfast, Belfast, United 
Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_2_1","volume-title":"Minigpt4-video: Advancing multimodal llms for video understanding with interleaved visual-textual tokens. arXiv preprint arXiv:2404.03413","author":"Ataallah Kirolos","year":"2024","unstructured":"Kirolos Ataallah, Xiaoqian Shen, Eslam Abdelrahman, Essam Sleiman, Deyao Zhu, Jian Ding, and Mohamed Elhoseiny. 2024. Minigpt4-video: Advancing multimodal llms for video understanding with interleaved visual-textual tokens. arXiv preprint arXiv:2404.03413 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Shikra: Unleashing multimodal llm's referential dialogue magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023b. Shikra: Unleashing multimodal llm's referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Mgfn: Magnitude-contrastive glance-and-focus network for weakly-supervised video anomaly detection. In AAAI.","author":"Chen Yingxian","year":"2023","unstructured":"Yingxian Chen, Zhengzhe Liu, Baoheng Zhang, Wilton Fok, Xiaojuan Qi, and Yik-Chung Wu. 2023a. Mgfn: Magnitude-contrastive glance-and-focus network for weakly-supervised video anomaly detection. In AAAI."},{"key":"e_1_3_2_1_5_1","unstructured":"Zesen Cheng Sicong Leng Hang Zhang Yifei Xin Xin Li Guanzheng Chen Yongxin Zhu Wenqi Zhang Ziyang Luo Deli Zhao et al. 2024. Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms. 
arXiv preprint arXiv:2406.07476 (2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-78125-4_25"},{"key":"e_1_3_2_1_7_1","volume-title":"Armand Joulin, and Ishan Misra.","author":"Girdhar Rohit","year":"2023","unstructured":"Rohit Girdhar, Alaaeldin El-Nouby, Zhuang Liu, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, and Ishan Misra. 2023. Imagebind: One embedding space to bind them all. In CVPR."},{"key":"e_1_3_2_1_8_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Mahmudul Hasan Jonghyun Choi Jan Neumann Amit K Roy-Chowdhury and Larry S Davis. 2016. Learning temporal regularity in video sequences. In CVPR.","DOI":"10.1109\/CVPR.2016.86"},{"key":"e_1_3_2_1_10_1","unstructured":"Wenyi Hong Weihan Wang Ming Ding Wenmeng Yu Qingsong Lv Yan Wang Yean Cheng Shiyu Huang Junhui Ji Zhao Xue et al. 2024. Cogvlm2: Visual language models for image and video understanding. arXiv preprint arXiv:2408.16500 (2024)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"e_1_3_2_1_12_1","volume-title":"Video-lavit: Unified video-language pre-training with decoupled visual-motional tokenization. arXiv preprint arXiv:2402.03161","author":"Jin Yang","year":"2024","unstructured":"Yang Jin, Zhicheng Sun, Kun Xu, Liwei Chen, Hao Jiang, Quzhe Huang, Chengru Song, Yuliang Liu, Di Zhang, Yang Song, et al., 2024. Video-lavit: Unified video-language pre-training with decoupled visual-motional tokenization. arXiv preprint arXiv:2402.03161 (2024)."},{"key":"e_1_3_2_1_13_1","unstructured":"Yang Jin Kun Xu Liwei Chen Chao Liao Jianchao Tan Quzhe Huang Bin Chen Chenyi Lei An Liu Chengru Song et al. 2023. 
Unified language-vision pretraining in llm with dynamic discrete visual tokenization. arXiv preprint arXiv:2309.04669 (2023)."},{"key":"e_1_3_2_1_14_1","unstructured":"Hyekang Kevin Joo Khoa Vo Kashu Yamazaki and Ngan Le. 2023. CLIP-TSA: CLIP-Assisted Temporal Self-Attention for Weakly-Supervised Video Anomaly Detection. In ICIP."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jesp.2013.03.013"},{"key":"e_1_3_2_1_16_1","unstructured":"Guoqiu Li Guanxiong Cai Xingyu Zeng and Rui Zhao. 2022a. Scale-Aware Spatio-Temporal Relation Learning for Video Anomaly Detection. In ECCV."},{"key":"e_1_3_2_1_17_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML."},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022b. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_2_1_20_1","unstructured":"Shuo Li Fang Liu and Licheng Jiao. 2022c. Self-training multi-sequence learning with transformer for weakly supervised video anomaly detection. In AAAI."},{"key":"e_1_3_2_1_21_1","volume-title":"Xinyi Wang, Ziyang Yan, and Yihua Shao.","author":"Liao Minwen","year":"2025","unstructured":"Minwen Liao, Hao Bo Dong, Xinyi Wang, Ziyang Yan, and Yihua Shao. 2025. GM-MoE: Low-Light Enhancement with Gated-Mechanism Mixture-of-Experts. arXiv preprint arXiv:2503.07417 (2025)."},{"key":"e_1_3_2_1_22_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. 
arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Improved baselines with visual instruction tuning. arXiv","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023a. Improved baselines with visual instruction tuning. arXiv (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023b. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_25_1","volume-title":"European Conference on Computer Vision. Springer, 1-18","author":"Liu Ruyang","year":"2024","unstructured":"Ruyang Liu, Chen Li, Haoran Tang, Yixiao Ge, Ying Shan, and Ge Li. 2024. St-llm: Large language models are effective temporal learners. In European Conference on Computer Vision. Springer, 1-18."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i23.34653"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463070"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29837"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.450"},{"key":"e_1_3_2_1_30_1","unstructured":"Cewu Lu Jianping Shi and Jiaya Jia. 2013. Abnormal event detection at 150 fps in matlab. In ICCV."},{"key":"e_1_3_2_1_31_1","volume-title":"Video-chatgpt: Towards detailed video understanding via large vision and language models. 
arXiv preprint arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Umesh Chandra Pati, and Santos Kumar Das","author":"Nayak Rashmiranjan","year":"2021","unstructured":"Rashmiranjan Nayak, Umesh Chandra Pati, and Santos Kumar Das. 2021. A comprehensive review on deep learning-based methods for video anomaly detection. Image and Vision Computing (2021)."},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3040591"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs15143585"},{"key":"e_1_3_2_1_36_1","volume-title":"Markus Hagenbuchner, and Gabriele Monfardini.","author":"Scarselli Franco","year":"2008","unstructured":"Franco Scarselli, Marco Gori, Ah Chung Tsoi, Markus Hagenbuchner, and Gabriele Monfardini. 2008. The graph neural network model. IEEE transactions on neural networks, Vol. 20, 1 (2008), 61-80."},{"key":"e_1_3_2_1_37_1","volume-title":"GWQ: Gradient-Aware Weight Quantization for Large Language Models. arXiv preprint arXiv:2411.00850","author":"Shao Yihua","year":"2024","unstructured":"Yihua Shao, Siyu Liang, Zijian Ling, Minxi Yan, Haiyang Liu, Siyu Chen, Ziyang Yan, Chenyu Zhang, Haotong Qin, Michele Magno, et al., 2024a. GWQ: Gradient-Aware Weight Quantization for Large Language Models. 
arXiv preprint arXiv:2411.00850 (2024)."},{"key":"e_1_3_2_1_38_1","unstructured":"Yihua Shao Deyang Lin Fanhu Zeng Minxi Yan Muyang Zhang Siyu Chen Yuxuan Fan Ziyang Yan Haozhe Wang Jingcai Guo et al. 2025a. TR-DQ: Time-Rotation Diffusion Quantization. arXiv preprint arXiv:2503.06564 (2025)."},{"key":"e_1_3_2_1_39_1","volume-title":"AccidentBlip: Agent of Accident Warning based on MA-former. arXiv preprint arXiv:2404.12149","author":"Shao Yihua","year":"2024","unstructured":"Yihua Shao, Yeling Xu, Xinwei Long, Siyu Chen, Ziyang Yan, Yang Yang, Haoting Liu, Yan Wang, Hao Tang, and Zhen Lei. 2024b. AccidentBlip: Agent of Accident Warning based on MA-former. arXiv preprint arXiv:2404.12149 (2024)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Yihua Shao Minxi Yan Yang Liu Siyu Chen Wenjie Chen Xinwei Long Ziyang Yan Lei Li Chenyu Zhang Nicu Sebe et al. 2025b. In-Context Meta LoRA Generation. arXiv preprint arXiv:2501.17635 (2025).","DOI":"10.24963\/ijcai.2025\/683"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Fahad Sohrab Jenni Raitoharju Moncef Gabbouj and Alexandros Iosifidis. 2018. Subspace support vector data description. In ICPR.","DOI":"10.1109\/ICPR.2018.8545819"},{"key":"e_1_3_2_1_42_1","volume-title":"Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Waqas Sultani Chen Chen and Mubarak Shah. 2018. Real-world anomaly detection in surveillance videos. In CVPR.","DOI":"10.1109\/CVPR.2018.00678"},{"key":"e_1_3_2_1_44_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. 
arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Raft: Recurrent all-pairs field transforms for optical flow. In Computer Vision-ECCV 2020: 16th European Conference","author":"Teed Zachary","year":"2020","unstructured":"Zachary Teed and Jia Deng. 2020. Raft: Recurrent all-pairs field transforms for optical flow. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part II 16. Springer, 402-419."},{"key":"e_1_3_2_1_46_1","volume-title":"Heeseung Choi, Haksub Kim, and Ig-Jae Kim.","author":"Thakare Kamalakar Vijay","year":"2023","unstructured":"Kamalakar Vijay Thakare, Debi Prosad Dogra, Heeseung Choi, Haksub Kim, and Ig-Jae Kim. 2023a. RareAnom: A Benchmark Video Dataset for Rare Type Anomalies. Pattern Recognition (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Heeseung Choi, and Ig-Jae Kim.","author":"Thakare Kamalakar Vijay","year":"2023","unstructured":"Kamalakar Vijay Thakare, Yash Raghuwanshi, Debi Prosad Dogra, Heeseung Choi, and Ig-Jae Kim. 2023b. DyAnNet: A Scene Dynamicity Guided Self-Trained Video Anomaly Detection Network. In WACV."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Yu Tian Guansong Pang Yuanhong Chen Rajvinder Singh Johan W Verjans and Gustavo Carneiro. 2021. Weakly-supervised video anomaly detection with robust temporal feature magnitude learning. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00493"},{"key":"e_1_3_2_1_49_1","volume-title":"Llama: Open and efficient foundation language models. arXiv","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023. 
Llama: Open and efficient foundation language models. arXiv (2023)."},{"key":"e_1_3_2_1_50_1","unstructured":"Anil Osman Tur Nicola Dall'Asen Cigdem Beyan and Elisa Ricci. 2023a. Exploring diffusion models for unsupervised video anomaly detection. In ICIP."},{"key":"e_1_3_2_1_51_1","unstructured":"Anil Osman Tur Nicola Dall'Asen Cigdem Beyan and Elisa Ricci. 2023b. Unsupervised Video Anomaly Detection with Diffusion Models Conditioned on Compact Motion Representations. In ICIAP."},{"key":"e_1_3_2_1_52_1","volume-title":"Attention is all you need. NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS (2017)."},{"key":"e_1_3_2_1_53_1","volume-title":"Graph Attention Networks. In International Conference on Learning Representations.","author":"Veli\u010dkovi\u0107 Petar","year":"2018","unstructured":"Petar Veli\u010dkovi\u0107, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Li\u00f2, and Yoshua Bengio. 2018. Graph Attention Networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_54_1","volume-title":"Gods: Generalized one-class discriminative subspaces for anomaly detection. In ICCV.","author":"Wang Jue","year":"2019","unstructured":"Jue Wang and Anoop Cherian. 2019. Gods: Generalized one-class discriminative subspaces for anomaly detection. In ICCV."},{"key":"e_1_3_2_1_55_1","unstructured":"Nan Wang Yuantao Chen Lixing Xiao Weiqing Xiao Bohan Li Zhaoxi Chen Chongjie Ye Shaocong Xu Saining Zhang Ziyang Yan et al. 2025. Unifying Appearance Codes and Bilateral Grids for Driving Scene Gaussian Splatting. 
arXiv preprint arXiv:2506.05280 (2025)."},{"key":"e_1_3_2_1_56_1","first-page":"121475","article-title":"Cogvlm: Visual expert for pretrained language models","volume":"37","author":"Wang Weihan","year":"2024","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Song XiXuan, et al., 2024. Cogvlm: Visual expert for pretrained language models. Advances in Neural Information Processing Systems, Vol. 37 (2024), 121475-121499.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_57_1","unstructured":"Jhih-Ciang Wu He-Yen Hsieh Ding-Jie Chen Chiou-Shann Fuh and Tyng-Luh Liu. 2022. Self-supervised Sparse Representation for Video Anomaly Detection. In ECCV."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3062192"},{"key":"e_1_3_2_1_59_1","unstructured":"Peng Wu Jing Liu Yujia Shi Yujia Sun Fangtao Shao Zhaoyang Wu and Zhiwei Yang. 2020. Not only look but also listen: Learning multimodal violence detection under weak supervision. In ECCV."},{"key":"e_1_3_2_1_60_1","volume-title":"Renderworld: World model with self-supervised 3d label. arXiv preprint arXiv:2409.11356","author":"Yan Ziyang","year":"2024","unstructured":"Ziyang Yan, Wenzhen Dong, Yihua Shao, Yuhang Lu, Liu Haiyang, Jingwen Liu, Haozhe Wang, Zhe Wang, Yan Wang, Fabio Remondino, et al., 2024a. Renderworld: World model with self-supervised 3d label. arXiv preprint arXiv:2409.11356 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"3dsceneeditor","author":"Yan Ziyang","year":"2024","unstructured":"Ziyang Yan, Lei Li, Yihua Shao, Siyu Chen, Zongkai Wu, Jenq-Neng Hwang, Hao Zhao, and Fabio Remondino. 2024b. 3dsceneeditor: Controllable 3d scene editing with gaussian splatting. arXiv preprint arXiv:2412.01583 (2024)."},{"key":"e_1_3_2_1_62_1","first-page":"219","article-title":"NeRFBK: a holistic dataset for benchmarking NeRF-based 3D reconstruction. 
International Archives of the Photogrammetry","volume":"48","author":"Yan Ziyang","year":"2023","unstructured":"Ziyang Yan, Gabriele Mazzacca, Simone Rigon, Elisa Mariarosaria Farella, Pawel Trybala, Fabio Remondino, et al., 2023. NeRFBK: a holistic dataset for benchmarking NeRF-based 3D reconstruction. International Archives of the Photogrammetry, Remote Sensing and Spatial Information Sciences, Vol. 48, 1 (2023), 219-226.","journal-title":"Remote Sensing and Spatial Information Sciences"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.3390\/metrology5020020"},{"key":"e_1_3_2_1_64_1","volume-title":"Proceedings of the ieee\/cvf conference on computer vision and pattern recognition. 13040-13051","author":"Ye Qinghao","year":"2024","unstructured":"Qinghao Ye, Haiyang Xu, Jiabo Ye, Ming Yan, Anwen Hu, Haowei Liu, Qi Qian, Ji Zhang, and Fei Huang. 2024. mplug-owl2: Revolutionizing multi-modal large language model with modality collaboration. In Proceedings of the ieee\/cvf conference on computer vision and pattern recognition. 13040-13051."},{"key":"e_1_3_2_1_65_1","volume-title":"Claws: Clustering assisted weakly supervised learning with normalcy suppression for anomalous event detection. In ECCV.","author":"Zaheer Muhammad Zaigham","year":"2020","unstructured":"Muhammad Zaigham Zaheer, Arif Mahmood, Marcella Astrid, and Seung-Ik Lee. 2020. Claws: Clustering assisted weakly supervised learning with normalcy suppression for anomalous event detection. In ECCV."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"M Zaigham Zaheer Arif Mahmood M Haris Khan Mattia Segu Fisher Yu and Seung-Ik Lee. 2022. Generative cooperative learning for unsupervised video anomaly detection. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01433"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01753"},{"key":"e_1_3_2_1_68_1","volume-title":"Local-Prompt: Extensible Local Prompts for Few-Shot Out-of-Distribution Detection. 
In The Thirteenth International Conference on Learning Representations.","author":"Zeng Fanhu","year":"2025","unstructured":"Fanhu Zeng, Zhen Cheng, Fei Zhu, Hongxin Wei, and Xu-Yao Zhang. 2025b. Local-Prompt: Extensible Local Prompts for Few-Shot Out-of-Distribution Detection. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_69_1","volume-title":"Towards Efficient and General-Purpose Few-Shot Misclassification Detection for Vision-Language Models. arXiv preprint arXiv:2503.20492","author":"Zeng Fanhu","year":"2025","unstructured":"Fanhu Zeng, Zhen Cheng, Fei Zhu, and Xu-Yao Zhang. 2025a. Towards Efficient and General-Purpose Few-Shot Misclassification Detection for Vision-Language Models. arXiv preprint arXiv:2503.20492 (2025)."},{"key":"e_1_3_2_1_70_1","volume-title":"Parameter Efficient Merging for Multimodal Large Language Models with Complementary Parameter Adaptation. arXiv preprint arXiv:2502.17159","author":"Zeng Fanhu","year":"2025","unstructured":"Fanhu Zeng, Haiyang Guo, Fei Zhu, Li Shen, and Hao Tang. 2025c. Parameter Efficient Merging for Multimodal Large Language Models with Complementary Parameter Adaptation. arXiv preprint arXiv:2502.17159 (2025)."},{"key":"e_1_3_2_1_71_1","volume-title":"MambaIC: State Space Models for High-Performance Learned Image Compression. arXiv preprint arXiv:2503.12461","author":"Zeng Fanhu","year":"2025","unstructured":"Fanhu Zeng, Hao Tang, Yihua Shao, Siyu Chen, Ling Shao, and Yan Wang. 2025d. MambaIC: State Space Models for High-Performance Learned Image Compression. arXiv preprint arXiv:2503.12461 (2025)."},{"key":"e_1_3_2_1_72_1","volume-title":"Workshop on Machine Learning and Compression, NeurIPS","author":"Zeng Fanhu","year":"2024","unstructured":"Fanhu Zeng and Deli Yu. 2024. M2M-TAG: Training-Free Many-to-Many Token Aggregation for Vision Transformer Acceleration. 
In Workshop on Machine Learning and Compression, NeurIPS 2024."},{"key":"e_1_3_2_1_73_1","volume-title":"Modalprompt: Dual-modality guided prompt for continual learning of large multimodal models. arXiv preprint arXiv:2410.05849","author":"Zeng Fanhu","year":"2024","unstructured":"Fanhu Zeng, Fei Zhu, Haiyang Guo, Xu-Yao Zhang, and Cheng-Lin Liu. 2024. Modalprompt: Dual-modality guided prompt for continual learning of large multimodal models. arXiv preprint arXiv:2410.05849 (2024)."},{"key":"e_1_3_2_1_74_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_1_75_1","volume-title":"Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852","author":"Zhu Bin","year":"2023","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, HongFa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, et al., 2023. Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. 
arXiv preprint arXiv:2310.01852 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754500","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:05:10Z","timestamp":1765339510000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754500"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":75,"alternative-id":["10.1145\/3746027.3754500","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754500","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}