{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T00:40:58Z","timestamp":1777336858552,"version":"3.51.4"},"reference-count":38,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.neucom.2026.133532","type":"journal-article","created":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T01:55:29Z","timestamp":1775181329000},"page":"133532","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["A multimodal framework for violent behavior recognition in surveillance videos"],"prefix":"10.1016","volume":"684","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0672-5593","authenticated-orcid":false,"given":"Chih-Yung","family":"Chang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Syu-Jhih","family":"Jhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu-Ting","family":"Chin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"I.-Hsiung","family":"Chang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Diptendu Sinha","family":"Roy","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2026.133532_bib1","doi-asserted-by":"crossref","first-page":"3240","DOI":"10.1007\/s10489-022-03613-1","article-title":"Attention-based residual autoencoder for video anomaly detection","volume":"53","author":"Le","year":"2022","journal-title":"Appl. Intell."},{"issue":"9","key":"10.1016\/j.neucom.2026.133532_bib2","first-page":"1","article-title":"EADN: An efficient deep learning model for anomaly detection in videos","volume":"10","author":"Amin","year":"2022","journal-title":"Mathematics"},{"key":"10.1016\/j.neucom.2026.133532_bib3","series-title":"European conference on computer vision","first-page":"541","article-title":"Action detection with improved dense trajectories and sliding window","author":"Shu","year":"2014"},{"key":"10.1016\/j.neucom.2026.133532_bib4","doi-asserted-by":"crossref","unstructured":"L. Sun, X. Yang, and C. Hu, DSWHAR: A dynamic sliding window based human activity recognition method, 2022 IEEE Smartworld, Ubiquitous Intelligence & Computing, Scalable Computing & Communications, Digital Twin, Privacy Computing, Metaverse, Autonomous & Trusted Vehicles (SmartWorld\/UIC\/ScalCom\/DigitalTwin\/PriComp\/Meta), Haikou, China, 2022, pp. 1421-1426.","DOI":"10.1109\/SmartWorld-UIC-ATC-ScalCom-DigitalTwin-PriComp-Metaverse56740.2022.00205"},{"key":"10.1016\/j.neucom.2026.133532_bib5","doi-asserted-by":"crossref","unstructured":"V.-D. Le, T.-L. Nghiem, and T.-L. Le, Accurate continuous action and gesture recognition method based on skeleton and sliding windows techniques, 2023 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). 2023, pp. 284-290.","DOI":"10.1109\/APSIPAASC58517.2023.10317368"},{"issue":"17","key":"10.1016\/j.neucom.2026.133532_bib6","doi-asserted-by":"crossref","first-page":"1","DOI":"10.3390\/s22176563","article-title":"Anomaly detection in traffic surveillance videos using deep learning","volume":"22","author":"Khan","year":"2022","journal-title":"Sensors"},{"key":"10.1016\/j.neucom.2026.133532_bib7","doi-asserted-by":"crossref","unstructured":"S. Hamdi, S. Bouindour, K. Loukil, H. Snoussi, and M. Abid, Hybrid deep learning and HOF for anomaly detection, in Proc. Int. Conf. Control, Decis. Inf. Technol., Paris, France, Apr. 2019, pp. 1-6.","DOI":"10.1109\/CoDIT.2019.8820352"},{"issue":"19","key":"10.1016\/j.neucom.2026.133532_bib8","doi-asserted-by":"crossref","first-page":"1","DOI":"10.3390\/s19194145","article-title":"An efficient and robust unsupervised anomaly detection method using ensemble random projection in surveillance videos","volume":"19","author":"Hu","year":"2019","journal-title":"Sensors"},{"key":"10.1016\/j.neucom.2026.133532_bib9","doi-asserted-by":"crossref","unstructured":"M.B. Shaikh, D. Chai, S.M.S. Islam, and N. Akhtar, MAiVAR: Multimodal audio-image and video action recognizer, 2022 IEEE International Conference on Visual Communications and Image Processing (VCIP). 2022, pp. 1-5.","DOI":"10.1109\/VCIP56404.2022.10008833"},{"key":"10.1016\/j.neucom.2026.133532_bib10","unstructured":"J.D.S. Ortega, M. Senoussaoui, E. Granger, M. Pedersoli, P. Cardinal, and A.L. Koerich, Multimodal fusion with deep neural networks for audio-video emotion recognition, arXiv preprint arXiv:1907.03196, 2019."},{"key":"10.1016\/j.neucom.2026.133532_bib11","doi-asserted-by":"crossref","unstructured":"B. Yang, Q. Zhang, and Z. Liu, ICANet: A method of short video emotion recognition driven by multimodal data, 2022 2nd International Conference on Networking Systems of AI (INSAI). 2022, pp. 22-25.","DOI":"10.1109\/INSAI56792.2022.00014"},{"key":"10.1016\/j.neucom.2026.133532_bib12","doi-asserted-by":"crossref","unstructured":"J. He, Y. Ren, L. Zhai, and W. Liu, FCC-MF: Detecting violence in audio-visual context with frame-wise cluster contrast and modality-stage flooding, ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Seoul, Korea, Apr. 2024, pp. 8346-8350.","DOI":"10.1109\/ICASSP48485.2024.10447086"},{"key":"10.1016\/j.neucom.2026.133532_bib13","doi-asserted-by":"crossref","unstructured":"P. Wu, J. Liu, Y. Shi, Y. Sun, F. Shao, Z. Wu, and Z. Yang, Not only look, but also listen: Learning multimodal violence detection under weak supervision, European conference on computer vision. Glasgow, U.K., 2020, pp. 1-17.","DOI":"10.1007\/978-3-030-58577-8_20"},{"key":"10.1016\/j.neucom.2026.133532_bib14","doi-asserted-by":"crossref","unstructured":"Z. Liu, J. Ning, Y. Cao, Y. Wei, Z. Zhang, S. Lin, and H. Hu, Video Swin Transformer, Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3202\u20133211, 2022.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"10.1016\/j.neucom.2026.133532_bib15","unstructured":"A. Radford et al., Learning Transferable Visual Models From Natural Language Supervision, Proc. Int. Conf. Machine Learning (ICML), 2021."},{"key":"10.1016\/j.neucom.2026.133532_bib16","unstructured":"C. Jia et al., ALIGN: Scaling Up Visual and Vision-Language Representation Learning, Proc. Int. Conf. Machine Learning (ICML), 2021."},{"key":"10.1016\/j.neucom.2026.133532_bib17","doi-asserted-by":"crossref","unstructured":"J. Lei et al., TVR: A Large-Scale Dataset for Video-Subtitle Moment Retrieval, Proc. European Conf. Computer Vision (ECCV), 2020.","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"10.1016\/j.neucom.2026.133532_bib18","unstructured":"H. Xu et al., Video\u2013Language Pre-training with Contrastive Learning, Advances in Neural Information Processing Systems (NeurIPS), 2021."},{"key":"10.1016\/j.neucom.2026.133532_bib19","doi-asserted-by":"crossref","first-page":"4646","DOI":"10.1007\/s11263-025-02409-3","article-title":"Multi-text guidance is important: Multi-modality image fusion via large generative vision-language model","volume":"133","author":"Wang","year":"2025","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.neucom.2026.133532_bib20","article-title":"SD-fuse: an image structure-driven model for multi-focus image fusion","volume":"103","author":"Wang","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.neucom.2026.133532_bib21","article-title":"Infrared and visible image fusion via iterative feature decomposition and deep balanced fusion","volume":"151","author":"Zhao","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.neucom.2026.133532_bib22","doi-asserted-by":"crossref","first-page":"1321","DOI":"10.1109\/TIP.2026.3654370","article-title":"Rethinking multi-focus image fusion: an input space optimization view","volume":"35","author":"Wang","year":"2026","journal-title":"IEEE Trans. Image Process."},{"issue":"11","key":"10.1016\/j.neucom.2026.133532_bib23","first-page":"12747","article-title":"Dynamic unary convolution in transformers","volume":"45","author":"Duan","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.133532_bib24","unstructured":"Z. Wang, J. Zhang, H. Song, M. Ge, J. Wang, and H. Duan, Highlight what you want: Weakly-supervised instance-level controllable infrared-visible image fusion, in Proc. IEEE\/CVF Int. Conf. Computer Vision (ICCV), 2025, pp. 12637\u201312647."},{"key":"10.1016\/j.neucom.2026.133532_bib25","doi-asserted-by":"crossref","first-page":"701","DOI":"10.1109\/TASE.2023.3330704","article-title":"Edge-assisted epipolar transformer for industrial scene reconstruction","volume":"22","author":"Tong","year":"2025","journal-title":"IEEE Trans. Autom. Sci. Eng."},{"issue":"9","key":"10.1016\/j.neucom.2026.133532_bib26","doi-asserted-by":"crossref","first-page":"10761","DOI":"10.1109\/TII.2024.3392270","article-title":"Robust depth estimation based on parallax attention for aerial scene perception","volume":"20","author":"Tong","year":"2024","journal-title":"IEEE Trans. Ind. Inform."},{"key":"10.1016\/j.neucom.2026.133532_bib27","article-title":"Neural rendering and flow-assisted unsupervised multi-view stereo for real-time monocular tracking and scene perception","author":"Tong","year":"2025","journal-title":"IEEE Trans. Autom. Sci. Eng."},{"key":"10.1016\/j.neucom.2026.133532_bib28","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2024.102399","article-title":"Large-scale aerial scene perception based on self-supervised multi-view stereo via cycled generative adversarial network","volume":"109","author":"Tong","year":"2024","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.neucom.2026.133532_bib29","doi-asserted-by":"crossref","first-page":"4748","DOI":"10.1109\/TASE.2024.3448255","article-title":"Semi-supervised image domain adaption for aerial refueling drogue detection on embedded chip under foggy conditions","volume":"22","author":"Tong","year":"2025","journal-title":"IEEE Trans. Autom. Sci. Eng."},{"key":"10.1016\/j.neucom.2026.133532_bib30","unstructured":"J. Li, D. Li, C. Xiong, and S. Hoi, BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation, in Proceedings of the 39th International Conference on Machine Learning (ICML), 2022, pp. 12888-12900."},{"key":"10.1016\/j.neucom.2026.133532_bib31","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, and J. Sun, Deep residual learning for image recognition, Proc. IEEE Conf. Comput. Vis. Pattern Recognit., 2016, pp. 770-778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.neucom.2026.133532_bib32","doi-asserted-by":"crossref","unstructured":"J. Carreira and A. Zisserman, Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset, Proc. IEEE Conf. Comput. Vis. Pattern Recognit., 2017, pp. 4724-4733.","DOI":"10.1109\/CVPR.2017.502"},{"key":"10.1016\/j.neucom.2026.133532_bib33","doi-asserted-by":"crossref","unstructured":"P. Wu, J. Liu, Y. Shi, Y. Sun, F. Shao, Z. Wu, and Z. Yang, Not only look, but also listen: Learning multimodal violence detection under weak supervision, Eur. Conf. Comput. Vis., Glasgow, U.K., 2020, pp. 1-17.","DOI":"10.1007\/978-3-030-58577-8_20"},{"key":"10.1016\/j.neucom.2026.133532_bib34","unstructured":"A. Radford et al., Learning transferable visual models from natural language supervision, Proc. ICML, 2021, pp. 8748-8763."},{"key":"10.1016\/j.neucom.2026.133532_bib35","doi-asserted-by":"crossref","unstructured":"J. Carreira and A. Zisserman, Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset, Proc. IEEE CVPR, 2017, pp. 4724-4733.","DOI":"10.1109\/CVPR.2017.502"},{"issue":"6","key":"10.1016\/j.neucom.2026.133532_bib36","first-page":"2331","article-title":"Learning weakly supervised audio-visual violence detection in hyperbolic space","volume":"25","author":"Peng","year":"2024","journal-title":"IEEE Trans. Multimed."},{"issue":"10","key":"10.1016\/j.neucom.2026.133532_bib37","first-page":"6272","article-title":"Gated fusion networks for multi-modal violence detection","volume":"25","author":"Ahmad","year":"2025","journal-title":"Sensors"},{"key":"10.1016\/j.neucom.2026.133532_bib38","first-page":"36","article-title":"Multimodal violent video recognition based on mutual distillation","volume":"181","author":"Shang","year":"2024","journal-title":"Pattern Recognit. Lett."}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S092523122600929X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S092523122600929X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T23:48:19Z","timestamp":1777333699000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S092523122600929X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":38,"alternative-id":["S092523122600929X"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133532","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A multimodal framework for violent behavior recognition in surveillance videos","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133532","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"133532"}}