{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:03Z","timestamp":1781538843665,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Xinjiang Talent Development Fund","award":["XJRC-2025-KJ-PY-KJLJ-051"],"award-info":[{"award-number":["XJRC-2025-KJ-PY-KJLJ-051"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810882","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"874-883","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Rectifying Multimodal Variance: UAPA-HCF for Weakly Supervised Violence Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-5202-5114","authenticated-orcid":false,"given":"Longkun","family":"Shi","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Xinjiang University, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1899-0973","authenticated-orcid":false,"given":"Hanlin","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xinjiang University, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2178-6552","authenticated-orcid":false,"given":"Haoze","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xinjiang University, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9901-2613","authenticated-orcid":false,"given":"Yuanyuan","family":"Liao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xinjiang University, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1639-8899","authenticated-orcid":false,"given":"Turdi","family":"Tohti","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xinjiang University, Urumqi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Moloud Abdar Farhad Pourpanah Sadiq Hussain Dana Rezazadegan Li Liu Mohammad Ghavamzadeh Paul\u00a0W. Fieguth Xiaochun Cao Abbas Khosravi U.\u00a0Rajendra Acharya Vladimir Makarenkov and Saeid Nahavandi. 2020. A Review of Uncertainty Quantification in Deep Learning: Techniques Applications and Challenges. Inf. Fusion 76 (2020) 243\u2013297. https:\/\/api.semanticscholar.org\/CorpusID:226307260","DOI":"10.1016\/j.inffus.2021.05.008"},{"key":"e_1_3_3_1_3_2","series-title":"Proceedings of Machine Learning Research","first-page":"3142","volume-title":"Proceedings of The 28th International Conference on Artificial Intelligence and Statistics","volume":"258","author":"Bezirganyan Grigor","year":"2025","unstructured":"Grigor Bezirganyan, Sana Sellami, Laure Berti-Equille, and S\u00e9bastien Fournier. 2025. Multimodal Learning with Uncertainty Quantification based on Discounted Belief Fusion. In Proceedings of The 28th International Conference on Artificial Intelligence and Statistics(Proceedings of Machine Learning Research, Vol.\u00a0258), Yingzhen Li, Stephan Mandt, Shipra Agrawal, and Emtiyaz Khan (Eds.). PMLR, 3142\u20133150. https:\/\/proceedings.mlr.press\/v258\/bezirganyan25a.html"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Jo\u00e3o Carreira and Andrew Zisserman. 2017. Quo Vadis Action Recognition? A New Model and the Kinetics Dataset. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017) 4724\u20134733. https:\/\/api.semanticscholar.org\/CorpusID:206596127","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Jo\u00e3o Carreira and Andrew Zisserman. 2017. Quo Vadis Action Recognition? A New Model and the Kinetics Dataset. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017) 4724\u20134733. https:\/\/api.semanticscholar.org\/CorpusID:206596127","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","unstructured":"Claire-H\u00e9l\u00e8ne Demarty C\u00e9dric Penet Mohammad Soleymani and Guillaume Gravier. 2015. VSD a public dataset for the detection of violent scenes in movies: design annotation analysis and evaluation. Multimedia Tools Appl. 74 17 (Sept. 2015) 7379\u20137404. 10.1007\/s11042-014-1984-4","DOI":"10.1007\/s11042-014-1984-4"},{"key":"e_1_3_3_1_8_2","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01379"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00179"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Zongbo Han Changqing Zhang Huazhu Fu and Joey\u00a0Tianyi Zhou. 2023. Trusted Multi-View Classification With Dynamic Evidential Fusion. IEEE Transactions on Pattern Analysis and Machine Intelligence 45 2 (2023) 2551\u20132566. 10.1109\/TPAMI.2022.3171983","DOI":"10.1109\/TPAMI.2022.3171983"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.86"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"e_1_3_3_1_15_2","unstructured":"Maximilian Ilse Jakub\u00a0M. Tomczak and Max Welling. 2018. Attention-based Deep Multiple Instance Learning. ArXiv abs\/1802.04712 (2018). https:\/\/api.semanticscholar.org\/CorpusID:3537028"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","unstructured":"Wenping Jin Li Zhu and Jing Sun. 2025. Aligning First Then Fusing: A novel weakly supervised multimodal violence detection method. Knowledge-Based Systems 322 (2025) 113709. 10.1016\/j.knosys.2025.113709","DOI":"10.1016\/j.knosys.2025.113709"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295309"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","unstructured":"Shuo Li Fang Liu and Licheng Jiao. 2022. Self-Training Multi-Sequence Learning with Transformer for Weakly Supervised Video Anomaly Detection. Proceedings of the AAAI Conference on Artificial Intelligence 36 2 (Jun. 2022) 1395\u20131403. 10.1609\/aaai.v36i2.20028","DOI":"10.1609\/aaai.v36i2.20028"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/419"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Ze Liu Yutong Lin Yue Cao Han Hu Yixuan Wei Zheng Zhang Stephen Lin and Baining Guo. 2021. Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021) 9992\u201310002. https:\/\/api.semanticscholar.org\/CorpusID:232352874","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.45"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01517"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","unstructured":"Pablo Negre Ricardo\u00a0S. Alonso Alfonso Gonz\u00e1lez-Briones Javier Prieto and Sara Rodr\u00edguez-Gonz\u00e1lez. 2024. Literature Review of Deep-Learning-Based Detection of Violence in Video. Sensors 24 12 (2024). 10.3390\/s24124016","DOI":"10.3390\/s24124016"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Hyunjong Park Jongyoun Noh and Bumsub Ham. 2020. Learning Memory-Guided Normality for Anomaly Detection. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020) 14360\u201314369. https:\/\/api.semanticscholar.org\/CorpusID:214713500","DOI":"10.1109\/CVPR42600.2020.01438"},{"key":"e_1_3_3_1_26_2","series-title":"(NIPS\u201999)","first-page":"582","volume-title":"Proceedings of the 13th International Conference on Neural Information Processing Systems","author":"Sch\u00f6lkopf Bernhard","year":"1999","unstructured":"Bernhard Sch\u00f6lkopf, Robert Williamson, Alex Smola, John Shawe-Taylor, and John Platt. 1999. Support vector method for novelty detection. In Proceedings of the 13th International Conference on Neural Information Processing Systems (Denver, CO) (NIPS\u201999). MIT Press, Cambridge, MA, USA, 582\u2013588."},{"key":"e_1_3_3_1_27_2","volume-title":"Advances in Neural Information Processing Systems","author":"Sensoy Murat","year":"2018","unstructured":"Murat Sensoy, Lance Kaplan, and Melih Kandemir. 2018. Evidential Deep Learning to Quantify Classification Uncertainty. In Advances in Neural Information Processing Systems , S.\u00a0Bengio, H.\u00a0Wallach, H.\u00a0Larochelle, K.\u00a0Grauman, N.\u00a0Cesa-Bianchi, and R.\u00a0Garnett (Eds.), Vol.\u00a031. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2018\/file\/a981f2b708044d6fb4a71a1463242520-Paper.pdf"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Zheng Shou Dongang Wang and Shih-Fu Chang. 2016. Action Temporal Localization in Untrimmed Videos via Multi-stage CNNs. ArXiv abs\/1601.02129 (2016). https:\/\/api.semanticscholar.org\/CorpusID:6637122","DOI":"10.1109\/CVPR.2016.119"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-8181-6_16"},{"key":"e_1_3_3_1_30_2","volume-title":"Neural Information Processing Systems","author":"Snell Jake","year":"2017","unstructured":"Jake Snell, Kevin Swersky, and Richard\u00a0S. Zemel. 2017. Prototypical Networks for Few-shot Learning. In Neural Information Processing Systems. https:\/\/api.semanticscholar.org\/CorpusID:309759"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Waqas Sultani Chen Chen and Mubarak Shah. 2018. Real-World Anomaly Detection in Surveillance Videos. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018) 6479\u20136488. https:\/\/api.semanticscholar.org\/CorpusID:1610415","DOI":"10.1109\/CVPR.2018.00678"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Shengyang Sun and Xiaojin Gong. 2024. Multi-scale Bottleneck Transformer for Weakly Supervised Multimodal Violence Detection. 2024 IEEE International Conference on Multimedia and Expo (ICME) (2024) 1\u20136. https:\/\/api.semanticscholar.org\/CorpusID:269626606","DOI":"10.1109\/ICME57554.2024.10688202"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Yu Tian Guansong Pang Yuanhong Chen Rajvinder Singh Johan Verjans and G. Carneiro. 2021. Weakly-supervised Video Anomaly Detection with Robust Temporal Feature Magnitude Learning. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021) 4955\u20134966. https:\/\/api.semanticscholar.org\/CorpusID:236950792","DOI":"10.1109\/ICCV48922.2021.00493"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME46284.2020.9102722"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","unstructured":"Dicong Wang Qilong Wang Qinghua Hu and Kaijun Wu. 2025. Multimodal VAD: Visual Anomaly Detection in Intelligent Monitoring System via Audio-Vision-Language. IEEE Transactions on Instrumentation and Measurement 74 (2025) 1\u201312. 10.1109\/TIM.2025.3578702","DOI":"10.1109\/TIM.2025.3578702"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_20"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","unstructured":"Peng Wu Xiaotao Liu and Jing Liu. 2023. Weakly Supervised Audio-Visual Violence Detection. IEEE Transactions on Multimedia 25 (2023) 1674\u20131685. 10.1109\/TMM.2022.3147369","DOI":"10.1109\/TMM.2022.3147369"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","unstructured":"Peng Wu Xuerong Zhou Guansong Pang Lingru Zhou Qingsen Yan Peng Wang and Yanning Zhang. 2024. VadCLIP: Adapting Vision-Language Models for Weakly Supervised Video Anomaly Detection. Proceedings of the AAAI Conference on Artificial Intelligence 38 6 (Mar. 2024) 6074\u20136082. 10.1609\/aaai.v38i6.28423","DOI":"10.1609\/aaai.v38i6.28423"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","unstructured":"Yang Xiao Guxue Gao Liejun Wang and Huicheng Lai. 2022. Optical Flow-Aware-Based Multi-Modal Fusion Network for Violence Detection. Entropy 24 7 (2022). 10.3390\/e24070939","DOI":"10.3390\/e24070939"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","unstructured":"Hong-Ming Yang Xu-Yao Zhang Fei Yin Qing Yang and Cheng-Lin Liu. 2022. Convolutional Prototype Network for Open Set Recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 5 (2022) 2358\u20132370. 10.1109\/TPAMI.2020.3045079","DOI":"10.1109\/TPAMI.2020.3045079"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","unstructured":"Liu Yang Zhenjie Wu Junkun Hong and Jun Long. 2023. MCL: A Contrastive Learning Method for Multimodal Data Fusion in Violence Detection. IEEE Signal Processing Letters 30 (2023) 408\u2013412. 10.1109\/LSP.2022.3227818","DOI":"10.1109\/LSP.2022.3227818"},{"key":"e_1_3_3_1_45_2","unstructured":"Jiashuo Yu Jin-Yuan Liu Ying Cheng Rui Feng and Yuejie Zhang. 2022. Modality-aware Contrastive Instance Learning with Self-Distillation for Weakly-Supervised Audio-Visual Violence Detection. Proceedings of the 30th ACM International Conference on Multimedia (2022). https:\/\/api.semanticscholar.org\/CorpusID:250451515"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","unstructured":"Dasheng Zhang Chao Huang Chengliang Liu and Yong Xu. 2022. Weakly Supervised Video Anomaly Detection via Transformer-Enabled Temporal Relation Learning. IEEE Signal Processing Letters 29 (2022) 1197\u20131201. 10.1109\/LSP.2022.3175092","DOI":"10.1109\/LSP.2022.3175092"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Jia-Xing Zhong Nannan Li Weijie Kong Shan Liu Thomas\u00a0H. Li and Ge Li. 2019. Graph Convolutional Label Noise Cleaner: Train a Plug-And-Play Action Classifier for Anomaly Detection. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019) 1237\u20131246. https:\/\/api.semanticscholar.org\/CorpusID:81981674","DOI":"10.1109\/CVPR.2019.00133"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","unstructured":"Hang Zhou Junqing Yu and Wei Yang. 2023. Dual Memory Units with Uncertainty Regulation for Weakly Supervised Video Anomaly Detection. Proceedings of the AAAI Conference on Artificial Intelligence 37 3 (Jun. 2023) 3769\u20133777. 10.1609\/aaai.v37i3.25489","DOI":"10.1609\/aaai.v37i3.25489"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","unstructured":"Xiao Zhou Xiaogang Peng Hao Wen Yikai Luo Keyang Yu Ping Yang and Zizhao Wu. 2024. Learning weakly supervised audio-visual violence detection in hyperbolic space. Image and Vision Computing 151 (2024) 105286. 10.1016\/j.imavis.2024.105286","DOI":"10.1016\/j.imavis.2024.105286"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:57:35Z","timestamp":1781535455000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810882"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":48,"alternative-id":["10.1145\/3805622.3810882","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810882","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}