{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T02:09:30Z","timestamp":1768874970552,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Key Research and Development Program of Zhejiang Province","award":["2024C01017"],"award-info":[{"award-number":["2024C01017"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680934","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"5055-5064","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["TDSD: Text-Driven Scene-Decoupled Weakly Supervised Video Anomaly Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-7727-1904","authenticated-orcid":false,"given":"Shengyang","family":"Sun","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8435-064X","authenticated-orcid":false,"given":"Jiashen","family":"Hua","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0341-333X","authenticated-orcid":false,"given":"Junyi","family":"Feng","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9379-0181","authenticated-orcid":false,"given":"Dongxu","family":"Wei","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0785-0294","authenticated-orcid":false,"given":"Baisheng","family":"Lai","sequence":"additional","affiliation":[{"name":"Alibaba Cloud, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9955-3569","authenticated-orcid":false,"given":"Xiaojin","family":"Gong","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.70825"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Qianyue Bao Fang Liu Yang Liu Licheng Jiao Xu Liu and Lingling Li. 2022. Hierarchical Scene Normality-Binding Modeling for Anomaly Detection in Surveillance Videos. In ACM MM. 6103--6112.","DOI":"10.1145\/3503161.3548199"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Congqi Cao Yue Lu Peng Wang and Yanning Zhang. 2023. A New Comprehensive Benchmark for Semi-supervised Video Anomaly Detection and Anticipation. In CVPR. 20392--20401.","DOI":"10.1109\/CVPR52729.2023.01953"},{"key":"e_1_3_2_1_4_1","volume-title":"Zi Jian Yew, Minhoe Hur, and David Aik-Aun Khoo.","author":"Chen Weiling","year":"2023","unstructured":"Weiling Chen, Keng Teck Ma, Zi Jian Yew, Minhoe Hur, and David Aik-Aun Khoo. 2023. TEVAD: Improved video anomaly detection with captions. In CVPR. 5548--5558."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25112"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"MyeongAh Cho Minjung Kim Sangwon Hwang Chaewon Park Kyungjae Lee and Sangyoun Lee. 2023. Look around for anomalies: weakly-supervised anomaly detection via context-motion relational learning. In CVPR. 12137--12146.","DOI":"10.1109\/CVPR52729.2023.01168"},{"key":"e_1_3_2_1_7_1","volume-title":"Mist: Multiple instance self-training framework for video anomaly detection. In CVPR. 14009--14018.","author":"Feng Jia-Chang","year":"2021","unstructured":"Jia-Chang Feng, Fa-Ting Hong, and Wei-Shi Zheng. 2021. Mist: Multiple instance self-training framework for video anomaly detection. In CVPR. 14009--14018."},{"key":"e_1_3_2_1_8_1","volume-title":"Svetha Venkatesh, and Anton van den Hengel.","author":"Gong Dong","year":"2019","unstructured":"Dong Gong, Lingqiao Liu, Vuong Le, Budhaditya Saha, Moussa Reda Mansour, Svetha Venkatesh, and Anton van den Hengel. 2019. Memorizing normality to detect anomaly: Memory-augmented deep autoencoder for unsupervised anomaly detection. In ICCV. 1705--1714."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3193857"},{"key":"e_1_3_2_1_10_1","volume-title":"Mariana-Iuliana Georgescu, and Ling Shao.","author":"Ionescu Radu Tudor","year":"2019","unstructured":"Radu Tudor Ionescu, Fahad Shahbaz Khan, Mariana-Iuliana Georgescu, and Ling Shao. 2019. Object-centric auto-encoders and dummy anomalies for abnormal event detection in video. In CVPR. 7842--7851."},{"key":"e_1_3_2_1_11_1","volume-title":"Detecting abnormal events in video using narrowed normality clusters","author":"Ionescu Radu Tudor","year":"1951","unstructured":"Radu Tudor Ionescu, Sorina Smeureanu, Marius Popescu, and Bogdan Alexe. 2019. Detecting abnormal events in video using narrowed normality clusters. In WACV. IEEE, 1951--1960."},{"key":"e_1_3_2_1_12_1","unstructured":"Chao Jia Yinfei Yang Ye Xia Yi-Ting Chen Zarana Parekh Hieu Pham Quoc Le Yun-Hsuan Sung Zhen Li and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML. PMLR 4904--4916."},{"key":"e_1_3_2_1_13_1","volume-title":"Visual prompt tuning","author":"Jia Menglin","unstructured":"Menglin Jia, Luming Tang, Bor-Chun Chen, Claire Cardie, Serge Belongie, Bharath Hariharan, and Ser-Nam Lim. 2022. Visual prompt tuning. In ECCV. Springer, 709--727."},{"key":"e_1_3_2_1_14_1","volume-title":"Clip-tsa: Clip-assisted temporal self-attention for weakly-supervised video anomaly detection","author":"Joo Hyekang Kevin","year":"2023","unstructured":"Hyekang Kevin Joo, Khoa Vo, Kashu Yamazaki, and Ngan Le. 2023. Clip-tsa: Clip-assisted temporal self-attention for weakly-supervised video anomaly detection. In ICIP. IEEE, 3230--3234."},{"key":"e_1_3_2_1_15_1","volume-title":"Cees GM Snoek, and Rita Cucchiara","author":"Landi Federico","year":"2019","unstructured":"Federico Landi, Cees GM Snoek, and Rita Cucchiara. 2019. Anomaly locality in video surveillance. arXiv preprint arXiv:1901.10364 (2019)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20028"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/TPAMI.2013.111","article-title":"Anomaly detection and localization in crowded scenes","volume":"36","author":"Li Weixin","year":"2013","unstructured":"Weixin Li, Vijay Mahadevan, and Nuno Vasconcelos. 2013. Anomaly detection and localization in crowded scenes. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 36, 1 (2013), 18--32.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Kun Liu and Huadong Ma. 2019. Exploring background-bias for anomaly detection in surveillance videos. In ACM MM. 1490--1499.","DOI":"10.1145\/3343031.3350998"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Wen Liu Weixin Luo Dongze Lian and Shenghua Gao. 2018. Future frame prediction for anomaly detection--a new baseline. In CVPR. 6536--6545.","DOI":"10.1109\/CVPR.2018.00684"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3386339"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3402242"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Zhian Liu Yongwei Nie Chengjiang Long Qing Zhang and Guiqing Li. 2021. A hybrid video anomaly detection framework via memory-augmented flow reconstruction and flow-guided frame prediction. In ICCV. 13588--13597.","DOI":"10.1109\/ICCV48922.2021.01333"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Cewu Lu Jianping Shi and Jiaya Jia. 2013. Abnormal event detection at 150 fps in matlab. In ICCV. 2720--2727.","DOI":"10.1109\/ICCV.2013.338"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Hui Lv Chen Chen Zhen Cui Chunyan Xu Yong Li and Jian Yang. 2021. Learning normal dynamics in videos with meta prototype network. In CVPR. 15425--15434.","DOI":"10.1109\/CVPR46437.2021.01517"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Hui Lv Zhongqi Yue Qianru Sun Bin Luo Zhen Cui and Hanwang Zhang. 2023. Unbiased Multiple Instance Learning for Weakly Supervised Video Anomaly Detection. In CVPR. 8022--8031.","DOI":"10.1109\/CVPR52729.2023.00775"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3072863"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Trong-Nguyen Nguyen and Jean Meunier. 2019. Anomaly detection in video sequence with appearance-motion correspondence. In ICCV. 1273--1283.","DOI":"10.1109\/ICCV.2019.00136"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Hyunjong Park Jongyoun Noh and Bumsub Ham. 2020. Learning memory-guided normality for anomaly detection. In CVPR. 14372--14381.","DOI":"10.1109\/CVPR42600.2020.01438"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Didik Purwanto Yie-Tarng Chen and Wen-Hsien Fang. 2021. Dance with self-attention: A new look of conditional random fields on anomaly detection in videos. In ICCV. 173--183.","DOI":"10.1109\/ICCV48922.2021.00024"},{"key":"e_1_3_2_1_30_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748--8763."},{"key":"e_1_3_2_1_31_1","first-page":"2293","article-title":"A Survey of Single-Scene Video Anomaly Detection","volume":"44","author":"Ramachandra Bharathkumar","year":"2022","unstructured":"Bharathkumar Ramachandra, Michael J. Jones, and Ranga Raju Vatsavai. 2022. A Survey of Single-Scene Video Anomaly Detection. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 5 (2022), 2293--2312.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_32_1","volume-title":"Denseclip: Language-guided dense prediction with context-aware prompting. In CVPR. 18082--18091.","author":"Rao Yongming","year":"2022","unstructured":"Yongming Rao, Wenliang Zhao, Guangyi Chen, Yansong Tang, Zheng Zhu, Guan Huang, Jie Zhou, and Jiwen Lu. 2022. Denseclip: Language-guided dense prediction with context-aware prompting. In CVPR. 18082--18091."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2670780"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Waqas Sultani Chen Chen and Mubarak Shah. 2018. Real-world anomaly detection in surveillance videos. In CVPR. 6479--6488.","DOI":"10.1109\/CVPR.2018.00678"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Che Sun Yunde Jia Yao Hu and Yuwei Wu. 2020. Scene-aware context reasoning for unsupervised abnormal event detection in videos. In ACM MM. 184--192.","DOI":"10.1145\/3394171.3413887"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Che Sun Yunde Jia and Yuwei Wu. 2022. Evidential Reasoning for Video Anomaly Detection. In ACM MM. 2106--2114.","DOI":"10.1145\/3503161.3548091"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Shengyang Sun and Xiaojin Gong. 2023. Hierarchical Semantic Contrast for Scene-aware Video Anomaly Detection. In CVPR. 22846--22856.","DOI":"10.1109\/CVPR52729.2023.02188"},{"key":"e_1_3_2_1_39_1","volume-title":"Long-Short Temporal Co-Teaching for Weakly Supervised Video Anomaly Detection","author":"Sun Shengyang","unstructured":"Shengyang Sun and Xiaojin Gong. 2023. Long-Short Temporal Co-Teaching for Weakly Supervised Video Anomaly Detection. In ICME. IEEE, 2711--2716."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.105169"},{"key":"e_1_3_2_1_41_1","volume-title":"Multi-scale Bottleneck Transformer for Weakly Supervised Multimodal Violence Detection","author":"Sun Shengyang","unstructured":"Shengyang Sun and Xiaojin Gong. 2024. Multi-scale Bottleneck Transformer for Weakly Supervised Multimodal Violence Detection. In ICME. IEEE."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Yu Tian Guansong Pang Yuanhong Chen Rajvinder Singh Johan W Verjans and Gustavo Carneiro. 2021. Weakly-supervised video anomaly detection with robust temporal feature magnitude learning. In ICCV. 4975--4986.","DOI":"10.1109\/ICCV48922.2021.00493"},{"key":"e_1_3_2_1_43_1","volume-title":"NIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3083152"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Jie Wu Wei Zhang Guanbin Li Wenhao Wu Xiao Tan Yingying Li Errui Ding and Liang Lin. 2021. Weakly-Supervised Spatio-Temporal Anomaly Detection in Surveillance Video. In IJCAI. 1172--1178.","DOI":"10.24963\/ijcai.2021\/162"},{"key":"e_1_3_2_1_46_1","volume-title":"Self-supervised sparse representation for video anomaly detection","author":"Wu Jhih-Ciang","unstructured":"Jhih-Ciang Wu, He-Yen Hsieh, Ding-Jie Chen, Chiou-Shann Fuh, and Tyng-Luh Liu. 2022. Self-supervised sparse representation for video anomaly detection. In ECCV. Springer, 729--745."},{"key":"e_1_3_2_1_47_1","volume-title":"Not only look, but also listen: Learning multimodal violence detection under weak supervision","author":"Wu Peng","unstructured":"Peng Wu, Jing Liu, Yujia Shi, Yujia Sun, Fangtao Shao, Zhaoyang Wu, and Zhiwei Yang. 2020. Not only look, but also listen: Learning multimodal violence detection under weak supervision. In ECCV. Springer, 322--339."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28423"},{"key":"e_1_3_2_1_49_1","volume-title":"Dual modality prompt tuning for vision-language pre-trained model","author":"Xing Yinghui","year":"2023","unstructured":"Yinghui Xing, Qirui Wu, De Cheng, Shizhou Zhang, Guoqiang Liang, Peng Wang, and Yanning Zhang. 2023. Dual modality prompt tuning for vision-language pre-trained model. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Guang Yu Siqi Wang Zhiping Cai En Zhu Chuanfu Xu Jianping Yin and Marius Kloft. 2020. Cloze test helps: Effective video anomaly detection via learning to complete video events. In ACM MM. 583--591.","DOI":"10.1145\/3394171.3413973"},{"key":"e_1_3_2_1_51_1","volume-title":"Delving into CLIP latent space for Video Anomaly Recognition. arXiv preprint arXiv:2310.02835","author":"Zanella Luca","year":"2023","unstructured":"Luca Zanella, Benedetta Liberatori, Willi Menapace, Fabio Poiesi, Yiming Wang, and Elisa Ricci. 2023. Delving into CLIP latent space for Video Anomaly Recognition. arXiv preprint arXiv:2310.02835 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Chen Zhang Guorong Li Yuankai Qi Shuhui Wang Laiyun Qing Qingming Huang and Ming-Hsuan Yang. 2023. Exploiting Completeness and Uncertainty of Pseudo Labels for Weakly Supervised Video Anomaly Detection. In CVPR. 16271--16280.","DOI":"10.1109\/CVPR52729.2023.01561"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Yiru Zhao Bing Deng Chen Shen Yao Liu Hongtao Lu and Xian-Sheng Hua. 2017. Spatio-temporal autoencoder for video anomaly detection. In ACM MM. 1933--1941.","DOI":"10.1145\/3123266.3123451"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Jia-Xing Zhong Nannan Li Weijie Kong Shan Liu Thomas H Li and Ge Li. 2019. Graph convolutional label noise cleaner: Train a plug-and-play action classifier for anomaly detection. In CVPR. 1237--1246.","DOI":"10.1109\/CVPR.2019.00133"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Hang Zhou Junqing Yu and Wei Yang. 2023. Dual Memory Units with Uncertainty Regulation for Weakly Supervised Video Anomaly Detection. In AAAI. Article 420 9 pages.","DOI":"10.1609\/aaai.v37i3.25489"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680934","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680934","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:34Z","timestamp":1750295854000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680934"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":57,"alternative-id":["10.1145\/3664647.3680934","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680934","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}