{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,18]],"date-time":"2026-04-18T04:58:59Z","timestamp":1776488339000,"version":"3.51.2"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T00:00:00Z","timestamp":1770076800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T00:00:00Z","timestamp":1770076800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Shanxi Province Basic Research Program","award":["No.202203021221145"],"award-info":[{"award-number":["No.202203021221145"]}]},{"name":"Shanxi Province Basic Research Program","award":["No.202203021221145"],"award-info":[{"award-number":["No.202203021221145"]}]},{"name":"Shanxi Province Basic Research Program","award":["No.202203021221145"],"award-info":[{"award-number":["No.202203021221145"]}]},{"name":"Shanxi Province Basic Research Program","award":["No.202203021221145"],"award-info":[{"award-number":["No.202203021221145"]}]},{"name":"Shanxi Province Basic Research Program","award":["No.202203021221145"],"award-info":[{"award-number":["No.202203021221145"]}]},{"name":"Shanxi Province Postgraduate Joint Training Demonstration Base Program","award":["No.2022JD11"],"award-info":[{"award-number":["No.2022JD11"]}]},{"name":"Shanxi Province Postgraduate Joint Training Demonstration Base Program","award":["No.2022JD11"],"award-info":[{"award-number":["No.2022JD11"]}]},{"name":"Shanxi Province Postgraduate Joint Training Demonstration Base Program","award":["No.2022JD11"],"award-info":[{"award-number":["No.2022JD11"]}]},{"name":"Shanxi Province Postgraduate Joint Training Demonstration Base 
Program","award":["No.2022JD11"],"award-info":[{"award-number":["No.2022JD11"]}]},{"name":"Shanxi Province Postgraduate Joint Training Demonstration Base Program","award":["No.2022JD11"],"award-info":[{"award-number":["No.2022JD11"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s00530-025-02158-w","type":"journal-article","created":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T03:40:42Z","timestamp":1770090042000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-modal spatio-temporal fusion weakly supervised video anomaly detection based on large-scale vision-language models"],"prefix":"10.1007","volume":"32","author":[{"given":"Lihu","family":"Pan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shouxin","family":"Peng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sendren Sheng-Dong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Binhong","family":"Xie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,3]]},"reference":[{"issue":"20","key":"2158_CR1","doi-asserted-by":"publisher","first-page":"23628","DOI":"10.1007\/s10489-023-04767-2","volume":"53","author":"J Lin","year":"2023","unstructured":"Lin, J., He, Y., Xu, W., Guan, J., Zhang, J., Zhou, S.: Latent feature reconstruction for unsupervised anomaly detection. Appl. Intell. 53(20), 23628\u201323640 (2023)","journal-title":"Appl. 
Intell."},{"issue":"8","key":"2158_CR2","doi-asserted-by":"publisher","first-page":"5171","DOI":"10.1109\/TII.2021.3122801","volume":"18","author":"C Huang","year":"2021","unstructured":"Huang, C., Wu, Z., Wen, J., Xu, Y., Jiang, Q., Wang, Y.: Abnormal event detection using deep contrastive learning for intelligent video surveillance system. IEEE Trans. Industr. Inf. 18(8), 5171\u20135179 (2021)","journal-title":"IEEE Trans. Industr. Inf."},{"key":"2158_CR3","doi-asserted-by":"crossref","unstructured":"Sultani, W., Chen, C., Shah, M.: Real-world anomaly detection in surveillance videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6479\u20136488 (2018)","DOI":"10.1109\/CVPR.2018.00678"},{"issue":"8","key":"2158_CR4","doi-asserted-by":"publisher","first-page":"5171","DOI":"10.1109\/TII.2021.3122801","volume":"18","author":"C Huang","year":"2022","unstructured":"Huang, C., Wu, Z., Wen, J., Xu, Y., Jiang, Q., Wang, Y.: Abnormal event detection using deep contrastive learning for intelligent video surveillance system. IEEE Trans. Industr. Inf. 18(8), 5171\u20135179 (2022)","journal-title":"IEEE Trans. Industr. Inf."},{"key":"2158_CR5","doi-asserted-by":"publisher","first-page":"4505","DOI":"10.1109\/TIP.2021.3072863","volume":"30","author":"H Lv","year":"2021","unstructured":"Lv, H., Zhou, C., Cui, Z., Xu, C., Li, Y., Yang, J.: Localizing anomalies from weakly-labeled videos. IEEE Trans. Image Process. 30, 4505\u20134515 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"2158_CR6","doi-asserted-by":"crossref","unstructured":"Hasan, M., Choi, J., Neumann, J., Roy-Chowdhury, A.K., Davis, L.S.: Learning temporal regularity in video sequences. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
733\u2013742 (2016)","DOI":"10.1109\/CVPR.2016.86"},{"issue":"2","key":"2158_CR7","doi-asserted-by":"publisher","first-page":"2843","DOI":"10.1109\/TII.2023.3298476","volume":"20","author":"Y Liu","year":"2024","unstructured":"Liu, Y., Liu, J., Yang, K., Ju, B., Liu, S., Wang, Y., Yang, D., Sun, P., Song, L.: Amp-net: Appearance-motion prototype network assisted automatic video anomaly detection system. IEEE Trans. Industr. Inf. 20(2), 2843\u20132855 (2024)","journal-title":"IEEE Trans. Industr. Inf."},{"issue":"23","key":"2158_CR8","doi-asserted-by":"publisher","first-page":"28133","DOI":"10.1007\/s10489-023-04940-7","volume":"53","author":"R Singh","year":"2023","unstructured":"Singh, R., Saini, K., Sethi, A., Tiwari, A., Saurav, S., Singh, S.: Stemgan: spatio-temporal generative adversarial network for video anomaly detection. Appl. Intell. 53(23), 28133\u201328152 (2023)","journal-title":"Appl. Intell."},{"key":"2158_CR9","first-page":"1","volume":"61","author":"Y Zhang","year":"2023","unstructured":"Zhang, Y., Wu, C., Guo, W., et al.: Cfanet: Efficient detection of uav image based on cross-layer feature aggregation. IEEE Trans. Geosci. Remote Sens. 61, 1\u201311 (2023)","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"2158_CR10","first-page":"1","volume":"62","author":"Y Zhang","year":"2024","unstructured":"Zhang, Y., Wu, C., Zhang, T., et al.: Full-scale feature aggregation and grouping feature reconstruction based uav image target detection. IEEE Trans. Geosci. Remote Sens. 62, 1\u201311 (2024)","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"issue":"1","key":"2158_CR11","doi-asserted-by":"publisher","first-page":"013005","DOI":"10.1117\/1.JEI.34.1.013005","volume":"34","author":"Y Zhang","year":"2025","unstructured":"Zhang, Y., Wang, S., Zhang, Y., et al.: Asymmetric light-aware progressive decoding network for rgb-thermal salient object detection. J. Electron. Imaging 34(1), 013005\u2013013005 (2025)","journal-title":"J. Electron. 
Imaging"},{"key":"2158_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2024.109806","volume":"141","author":"Y Zhang","year":"2025","unstructured":"Zhang, Y., Zhang, T., Wang, S., et al.: An efficient perceptual video compression scheme based on deep learning-assisted video saliency and just noticeable distortion. Eng. Appl. Artif. Intell. 141, 109806 (2025)","journal-title":"Eng. Appl. Artif. Intell."},{"key":"2158_CR13","doi-asserted-by":"publisher","first-page":"4183","DOI":"10.1109\/TMM.2023.3321394","volume":"26","author":"Y Zhang","year":"2023","unstructured":"Zhang, Y., Zhang, T., Wu, C., et al.: Multi-scale spatiotemporal feature fusion network for video saliency prediction. IEEE Trans. Multimed. 26, 4183\u20134193 (2023)","journal-title":"IEEE Trans. Multimed."},{"issue":"4","key":"2158_CR14","doi-asserted-by":"publisher","first-page":"2775","DOI":"10.1109\/TCSVT.2023.3312325","volume":"34","author":"Y Zhang","year":"2023","unstructured":"Zhang, Y., Liu, Y., Kang, W., et al.: Vss-net: Visual semantic self-mining network for video summarization. IEEE Trans. Circuits Syst. Video Technol. 34(4), 2775\u20132788 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"2158_CR15","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Zhen, J., Liu, T., et al.: Adaptive differentiation siamese fusion network for remote sensing change detection. IEEE Geosci. Remote Sens. Lett. (2024)","DOI":"10.1109\/LGRS.2024.3516775"},{"key":"2158_CR16","first-page":"6074","volume":"38","author":"P Wu","year":"2024","unstructured":"Wu, P., Zhou, X., Pang, G., Zhou, L., Yan, Q., Wang, P., Zhang, Y.: Vadclip: Adapting vision-language models for weakly supervised video anomaly detection. Proc. AAAI Conf. Artif. Intell. 38, 6074\u20136082 (2024)","journal-title":"Proc. AAAI Conf. Artif. 
Intell."},{"key":"2158_CR17","doi-asserted-by":"crossref","unstructured":"Wu, P., Zhou, X., Pang, G., Sun, Y., Liu, J., Wang, P., Zhang, Y.: Open-vocabulary video anomaly detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 18297\u201318307 (2024)","DOI":"10.1109\/CVPR52733.2024.01732"},{"key":"2158_CR18","unstructured":"Zhu, Y., Newsam, S.: Motion-aware feature for improved video anomaly detection. arXiv preprint arXiv:1907.10211 (2019)"},{"issue":"7","key":"2158_CR19","doi-asserted-by":"publisher","first-page":"4733","DOI":"10.1109\/TII.2020.3019788","volume":"17","author":"A Castellani","year":"2020","unstructured":"Castellani, A., Schmitt, S., Squartini, S.: Real-world anomaly detection by using digital twin systems and weakly supervised learning. IEEE Trans. Industr. Inf. 17(7), 4733\u20134742 (2020)","journal-title":"IEEE Trans. Industr. Inf."},{"key":"2158_CR20","doi-asserted-by":"crossref","unstructured":"Lv, H., Yue, Z., Sun, Q., Luo, B., Cui, Z., Zhang, H.: Unbiased multiple instance learning for weakly supervised video anomaly detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8022\u20138031 (2023)","DOI":"10.1109\/CVPR52729.2023.00775"},{"key":"2158_CR21","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: A joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"2158_CR22","unstructured":"Li, K., He, Y., Wang, Y., Li, Y., Wang, W., Luo, P., Wang, Y., Wang, L., Qiao, Y.: Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"2158_CR23","unstructured":"Su, Y., Lan, T., Li, H., Xu, J., Wang, Y., Cai, D.: Pandagpt: One model to instruction-follow them all. 
arXiv preprint arXiv:2305.16355 (2023)"},{"key":"2158_CR24","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"2158_CR25","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., Ji, L., Zhong, M., Chen, Y., Lei, W., Duan, N., Li, T.: Clip4clip: an empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"key":"2158_CR26","doi-asserted-by":"crossref","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., Xie, W.: Prompting visual-language models for efficient video understanding. In: Proceedings of the European Conference on Computer Vision, pp. 105\u2013124. Springer (2022)","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"2158_CR27","doi-asserted-by":"crossref","unstructured":"Yang, Z., Liu, J., Wu, P.: Text prompt with normality guidance for weakly supervised video anomaly detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 18899\u201318908 (2024)","DOI":"10.1109\/CVPR52733.2024.01788"},{"key":"2158_CR28","first-page":"3769","volume":"37","author":"H Zhou","year":"2023","unstructured":"Zhou, H., Yu, J., Yang, W.: Dual memory units with uncertainty regulation for weakly supervised video anomaly detection. Proc. AAAI Conf. Artif. Intell. 37, 3769\u20133777 (2023)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"2158_CR29","unstructured":"Gu, A., Dao, T.: Mamba: Linear-time sequence modeling with selective state spaces. 
arXiv preprint arXiv:2312.00752 (2023)"},{"key":"2158_CR30","unstructured":"Liu, Z., Wang, Y., Vaidya, S., Ruehle, F., Halverson, J., Solja\u010di\u0107, M., Hou, T.Y., Tegmark, M.: Kan: Kolmogorov-arnold networks. arXiv preprint arXiv:2404.19756 (2024)"},{"key":"2158_CR31","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, 32 (2018)","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"2158_CR32","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1016\/j.jpdc.2019.07.008","volume":"139","author":"MT Young","year":"2020","unstructured":"Young, M.T., Hinkle, J.D., Kannan, R., Ramanathan, A.: Distributed Bayesian optimization of deep reinforcement learning algorithms. J. Parallel Distributed Comput. 139, 43\u201352 (2020)","journal-title":"J. Parallel Distributed Comput."},{"key":"2158_CR33","unstructured":"Cowen-Rivers, A.I., Lyu, W., Wang, Z., Tutunov, R., Jianye, H., Wang, J., Ammar, H.B.: Hebo: Heteroscedastic evolutionary bayesian optimisation. arXiv preprint arXiv:2012.03826 (2020)"},{"key":"2158_CR34","doi-asserted-by":"crossref","unstructured":"Wu, P., Liu, J., Shi, Y., Sun, Y., Shao, F., Wu, Z., Yang, Z.: Not only look, but also listen: Learning multimodal violence detection under weak supervision. In: Proceedings of the European Conference on Computer Vision, 322\u2013339 (2020). Springer","DOI":"10.1007\/978-3-030-58577-8_20"},{"key":"2158_CR35","unstructured":"Sch\u00f6lkopf, B., Williamson, R.C., Smola, A., Shawe-Taylor, J., Platt, J.: Support vector method for novelty detection. Advances in Neural Information Processing Systems 12 (1999)"},{"key":"2158_CR36","doi-asserted-by":"publisher","first-page":"1674","DOI":"10.1109\/TMM.2022.3147369","volume":"25","author":"P Wu","year":"2022","unstructured":"Wu, P., Liu, X., Liu, J.: Weakly supervised audio-visual violence detection. 
IEEE Trans. Multimed. 25, 1674\u20131685 (2022)","journal-title":"IEEE Trans. Multimed."},{"key":"2158_CR37","doi-asserted-by":"crossref","unstructured":"Joo, H.K., Vo, K., Yamazaki, K., Le, N.: Clip-tsa: Clip-assisted temporal self-attention for weakly-supervised video anomaly detection. In: IEEE International Conference on Image Processing, pp. 3230\u20133234. IEEE (2023)","DOI":"10.1109\/ICIP49359.2023.10222289"},{"key":"2158_CR38","doi-asserted-by":"crossref","unstructured":"Luo, W., Liu, W., Gao, S.: Remembering history with convolutional lstm for anomaly detection. In: 2017 IEEE International Conference on Multimedia and Expo (ICME), pp. 439\u2013444. IEEE (2017)","DOI":"10.1109\/ICME.2017.8019325"},{"key":"2158_CR39","doi-asserted-by":"crossref","unstructured":"Liu, W., Luo, W., Lian, D., Gao, S.: Future frame prediction for anomaly detection\u2013a new baseline. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6536\u20136545 (2018)","DOI":"10.1109\/CVPR.2018.00684"},{"key":"2158_CR40","doi-asserted-by":"crossref","unstructured":"Gong, D., Liu, L., Le, V., Saha, B., Mansour, M.R., Venkatesh, S., Hengel, A.v.d.: Memorizing normality to detect anomaly: Memory-augmented deep autoencoder for unsupervised anomaly detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1705\u20131714 (2019)","DOI":"10.1109\/ICCV.2019.00179"},{"key":"2158_CR41","doi-asserted-by":"crossref","unstructured":"Park, H., Noh, J., Ham, B.: Learning memory-guided normality for anomaly detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14372\u201314381 (2020)","DOI":"10.1109\/CVPR42600.2020.01438"},{"key":"2158_CR42","doi-asserted-by":"crossref","unstructured":"Lu, Y., Kumar, K.M., Shahabeddin\u00a0Nabavi, S., Wang, Y.: Future frame prediction using convolutional vrnn for anomaly detection. 
In: 2019 16th IEEE International Conference on Advanced Video and Signal Based Surveillance (AVSS), pp. 1\u20138. IEEE (2019)","DOI":"10.1109\/AVSS.2019.8909850"},{"key":"2158_CR43","doi-asserted-by":"crossref","unstructured":"Lu, Y., Yu, F., Reddy, M.K.K., Wang, Y.: Few-shot scene-adaptive anomaly detection. In: Computer Vision - ECCV 2020: 16th European Conference, Glasgow, UK, August 23 - 28, 2020, Proceedings, Part V 16, 125\u2013141. Springer (2020)","DOI":"10.1007\/978-3-030-58558-7_8"},{"key":"2158_CR44","doi-asserted-by":"crossref","unstructured":"Lv, H., Chen, C., Cui, Z., Xu, C., Li, Y., Yang, J.: Learning normal dynamics in videos with meta prototype network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15425\u201315434 (2021)","DOI":"10.1109\/CVPR46437.2021.01517"},{"key":"2158_CR45","unstructured":"Hu, C., Wu, F., Wu, W., Qiu, W., Lai, S.: Normal learning in videos with attention prototype network. arXiv preprint arXiv:2108.11055 (2021)"}],"container-title":["Multimedia 
Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-02158-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-02158-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-02158-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,18]],"date-time":"2026-04-18T04:27:07Z","timestamp":1776486427000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-02158-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,3]]},"references-count":45,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["2158"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-02158-w","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,3]]},"assertion":[{"value":"23 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 February 2026","order":5,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Update","order":6,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Original article has been corrected to update 
affiliation.","order":7,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This article does not contain studies with human participants or animals. Statement of informed consent is not applicable since the manuscript does not contain any patient data.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval and informed consent"}}],"article-number":"104"}}