{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:18:44Z","timestamp":1775067524406,"version":"3.50.1"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2024,6,15]],"date-time":"2024-06-15T00:00:00Z","timestamp":1718409600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,6,15]],"date-time":"2024-06-15T00:00:00Z","timestamp":1718409600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Social Science Foundation of Shandong Province","award":["21CZZJ04"],"award-info":[{"award-number":["21CZZJ04"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Real-Time Image Proc"],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1007\/s11554-024-01486-w","type":"journal-article","created":{"date-parts":[[2024,6,15]],"date-time":"2024-06-15T03:36:32Z","timestamp":1718422592000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Efficiently adapting large pre-trained models for real-time violence recognition in smart city surveillance"],"prefix":"10.1007","volume":"21","author":[{"given":"Xiaohui","family":"Ren","sequence":"first","affiliation":[]},{"given":"Wenze","family":"Fan","sequence":"additional","affiliation":[]},{"given":"Yinghao","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,6,15]]},"reference":[{"issue":"11","key":"1486_CR1","doi-asserted-by":"publisher","first-page":"4571","DOI":"10.1002\/ett.4571","volume":"34","author":"R Sharma","year":"2023","unstructured":"Sharma, R., Arya, R.: Security threats and measures in the internet of things for smart city infrastructure: a state of art. Trans. Emerg. Telecommun. Technol. 34(11), 4571 (2023)","journal-title":"Trans. Emerg. Telecommun. Technol."},{"issue":"2","key":"1486_CR2","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1007\/s11554-023-01286-8","volume":"20","author":"MA Khan","year":"2023","unstructured":"Khan, M.A., Menouar, H., Hamila, R.: LCDnet: a lightweight crowd density estimation model for real-time video surveillance. J. Real-Time Image Process. 20(2), 29 (2023)","journal-title":"J. Real-Time Image Process."},{"key":"1486_CR3","doi-asserted-by":"publisher","DOI":"10.1016\/j.micpro.2020.103303","volume":"79","author":"M Murugesan","year":"2020","unstructured":"Murugesan, M., Thilagamani, S.: Efficient anomaly detection in surveillance videos based on multi layer perception recurrent neural network. Microprocess. Microsyst. 79, 103303 (2020)","journal-title":"Microprocess. Microsyst."},{"key":"1486_CR4","doi-asserted-by":"publisher","first-page":"18772","DOI":"10.1109\/ACCESS.2023.3245521","volume":"11","author":"VD Husz\u00e1r","year":"2023","unstructured":"Husz\u00e1r, V.D., Adhikarla, V.K., N\u00e9gyesi, I., Krasznay, C.: Toward fast and accurate violence detection for automated video surveillance applications. IEEE Access 11, 18772\u201318793 (2023)","journal-title":"IEEE Access"},{"key":"1486_CR5","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1016\/j.imavis.2016.01.006","volume":"48","author":"Y Gao","year":"2016","unstructured":"Gao, Y., Liu, H., Sun, X., Wang, C., Liu, Y.: Violence detection using oriented violent flows. Image Vis. Comput. 48, 37\u201341 (2016)","journal-title":"Image Vis. Comput."},{"key":"1486_CR6","doi-asserted-by":"crossref","unstructured":"Bermejo Nievas, E., Deniz Suarez, O., Bueno Garc\u00eda, G., Sukthankar, R.: Violence detection in video using computer vision techniques. In: Computer Analysis of Images and Patterns: 14th International Conference, CAIP 2011, Seville, Spain, August 29\u201331, 2011, Proceedings, Part II 14. pp. 332\u2013339. Springer (2011)","DOI":"10.1007\/978-3-642-23678-5_39"},{"key":"1486_CR7","doi-asserted-by":"crossref","unstructured":"Hassner, T., Itcher, Y., Kliper-Gross, O.: Violent flows: real-time detection of violent crowd behavior. In: 2012 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops, pp. 1\u20136. IEEE (2012)","DOI":"10.1109\/CVPRW.2012.6239348"},{"key":"1486_CR8","doi-asserted-by":"crossref","unstructured":"Sudhakaran, S., Lanz, O.: Learning to detect violent videos using convolutional long short-term memory. In: 2017 14th IEEE International Conference on Advanced Video and Signal Based Surveillance (AVSS), pp. 1\u20136. IEEE (2017)","DOI":"10.1109\/AVSS.2017.8078468"},{"issue":"10","key":"1486_CR9","doi-asserted-by":"publisher","first-page":"4787","DOI":"10.1109\/TIP.2018.2845742","volume":"27","author":"I Serrano","year":"2018","unstructured":"Serrano, I., Deniz, O., Espinosa-Aranda, J.L., Bueno, G.: Fight recognition in video using Hough forests and 2D convolutional neural network. IEEE Trans. Image Process. 27(10), 4787\u20134797 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"1486_CR10","doi-asserted-by":"crossref","unstructured":"Dong, Z., Qin, J., Wang, Y.: Multi-stream deep networks for person to person violence detection in videos. In: Pattern Recognition: 7th Chinese Conference, CCPR 2016, Chengdu, China, November 5\u20137, 2016, Proceedings, Part I 7, pp. 517\u2013531. Springer (2016)","DOI":"10.1007\/978-981-10-3002-4_43"},{"key":"1486_CR11","doi-asserted-by":"crossref","unstructured":"Islam, Z., Rukonuzzaman, M., Ahmed, R., Kabir, M.H., Farazi, M.: Efficient two-stream network for violence detection using separable convolutional LSTM. In: 2021 International Joint Conference on Neural Networks (IJCNN), pp. 1\u20138. IEEE (2021)","DOI":"10.1109\/IJCNN52387.2021.9534280"},{"key":"1486_CR12","doi-asserted-by":"crossref","unstructured":"Li, J., Jiang, X., Sun, T., Xu, K.: Efficient violence detection using 3D convolutional neural networks. In: 2019 16th IEEE International Conference on Advanced Video and Signal Based Surveillance (AVSS), pp. 1\u20138, IEEE (2019)","DOI":"10.1109\/AVSS.2019.8909883"},{"key":"1486_CR13","doi-asserted-by":"crossref","unstructured":"Huillcen Baca, H.A., Gutierrez Caceres, J.C., Luz Palomino Valdivia, F.: Efficiency in human actions recognition in video surveillance using 3D CNN and DenseNet. In: Future of Information and Communication Conference, pp. 342\u2013355. Springer (2022)","DOI":"10.1007\/978-3-030-98012-2_26"},{"issue":"2","key":"1486_CR14","doi-asserted-by":"publisher","first-page":"668","DOI":"10.3390\/s24020668","volume":"24","author":"H A Huillcen Baca","year":"2024","unstructured":"Huillcen Baca, H. A., Palomino Valdivia, Fd. L., Gutierrez Caceres, J. C.: Efficient human violence recognition for surveillance in real time. Sensors 24(2), 668 (2024)","journal-title":"Sensors"},{"key":"1486_CR15","doi-asserted-by":"crossref","unstructured":"Cheng, M., Cai, K., Li, M.: RWF-2000: an open large scale video database for violence detection. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 4183\u20134190. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9412502"},{"key":"1486_CR16","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"1486_CR17","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et al.: Improving language understanding by generative pre-training (2018)"},{"key":"1486_CR18","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763. PMLR (2021)"},{"key":"1486_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: CVPR, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"1486_CR20","unstructured":"Bahng, H., Jahanian, A., Sankaranarayanan, S., Isola, P.: Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274 (2022)"},{"key":"1486_CR21","unstructured":"Gu, X., Lin, T.-Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)"},{"key":"1486_CR22","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation (2022)"},{"key":"1486_CR23","unstructured":"Yang, T., Zhu, Y., Xie, Y., Zhang, A., Chen, C., Li, M.: AIM: adapting image models for efficient video action recognition. arXiv preprint arXiv:2302.03024 (2023)"},{"key":"1486_CR24","doi-asserted-by":"crossref","unstructured":"Park, J., Lee, J., Sohn, K.: Dual-path adaptation from image to video transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2203\u20132213 (2023)","DOI":"10.1109\/CVPR52729.2023.00219"},{"key":"1486_CR25","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Advances in Neural Information Processing Systems, vol. 27 (2014)"},{"key":"1486_CR26","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3D: expanding architectures for efficient video recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 203\u2013213 (2020)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"1486_CR27","unstructured":"Xiao, T., Xu, Z., He, W., Su, J., Zhang, Y., Opoku, R., Ison, R., Petho, J., Bian, J., Tighe, P., et al.: XTSFormer: cross-temporal-scale transformer for irregular time event prediction. arXiv preprint arXiv:2402.02258 (2024)"},{"key":"1486_CR28","doi-asserted-by":"crossref","unstructured":"Tian, Y., Yang, M., Zhang, L., Zhang, Z., Liu, Y., Xie, X., Que, X., Wang, W.: View while moving: Efficient video recognition in long-untrimmed videos. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 173\u2013183 (2023)","DOI":"10.1145\/3581783.3612035"},{"key":"1486_CR29","doi-asserted-by":"crossref","unstructured":"Weng, Y., Pan, Z., Han, M., Chang, X., Zhuang, B.: An efficient spatio-temporal pyramid transformer for action detection. In: European Conference on Computer Vision, pp. 358\u2013375. Springer (2022)","DOI":"10.1007\/978-3-031-19830-4_21"},{"key":"1486_CR30","unstructured":"He, W., Jiang, Z., Xiao, T., Xu, Z., Chen, S., Fick, R., Medina, M., Angelini, C.: A hierarchical spatial transformer for massive point samples in continuous space. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"1486_CR31","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1016\/j.neunet.2023.01.048","volume":"161","author":"FJ Rend\u00f3n-Segador","year":"2023","unstructured":"Rend\u00f3n-Segador, F.J., \u00c1lvarez-Garc\u00eda, J.A., Salazar-Gonz\u00e1lez, J.L., Tommasi, T.: CrimeNet: neural structured learning using vision transformer for violence detection. Neural Netw 161, 318\u2013329 (2023)","journal-title":"Neural Netw"},{"key":"1486_CR32","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"1486_CR33","unstructured":"Deniz, O., Serrano, I., Bueno, G., Kim, T.-K.: Fast violence detection in video. In: 2014 International Conference on Computer Vision Theory and Applications (VISAPP), vol. 2, pp. 478\u2013485. IEEE (2014)"},{"key":"1486_CR34","doi-asserted-by":"crossref","unstructured":"Bilinski, P., Bremond, F.: Human violence recognition and detection in surveillance videos. In: 2016 13th IEEE International Conference on Advanced Video and Signal Based Surveillance (AVSS), pp. 30\u201336. IEEE (2016)","DOI":"10.1109\/AVSS.2016.7738019"},{"issue":"3","key":"1486_CR35","doi-asserted-by":"publisher","first-page":"696","DOI":"10.1109\/TCSVT.2016.2589858","volume":"27","author":"T Zhang","year":"2016","unstructured":"Zhang, T., Jia, W., He, X., Yang, J.: Discriminative dictionary learning with motion weber local descriptor for violence detection. IEEE Trans. Circuits Syst Video Technol. 27(3), 696\u2013709 (2016)","journal-title":"IEEE Trans. Circuits Syst Video Technol."},{"key":"1486_CR36","doi-asserted-by":"crossref","unstructured":"Deb, T., Arman, A., Firoze, A.: Machine cognition of violence in videos using novel outlier-resistant vlad. In: 2018 17th IEEE International Conference on Machine Learning and Applications (ICMLA), pp. 989\u2013994. IEEE (2018)","DOI":"10.1109\/ICMLA.2018.00161"},{"key":"1486_CR37","doi-asserted-by":"crossref","unstructured":"Hanson, A., Pnvr, K., Krishnagopal, S., Davis, L.: Bidirectional convolutional LSTM for the detection of violence in videos. In: Proceedings of the European Conference on Computer Vision (ECCV) Workshops (2018)","DOI":"10.1007\/978-3-030-11012-3_24"},{"key":"1486_CR38","doi-asserted-by":"crossref","unstructured":"Huillcen\u00a0Baca, H.A., Luz Palomino\u00a0Valdivia, F., Solis, I.S., Cruz, M.A., Caceres, J.C.G.: Human violence recognition in video surveillance in real-time. In: Future of Information and Communication Conference, pp. 783\u2013795. Springer (2023)","DOI":"10.1007\/978-3-031-28073-3_52"},{"key":"1486_CR39","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"1486_CR40","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"}],"container-title":["Journal of Real-Time Image Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11554-024-01486-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11554-024-01486-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11554-024-01486-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,27]],"date-time":"2024-08-27T16:25:07Z","timestamp":1724775907000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11554-024-01486-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,15]]},"references-count":40,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,8]]}},"alternative-id":["1486"],"URL":"https:\/\/doi.org\/10.1007\/s11554-024-01486-w","relation":{},"ISSN":["1861-8200","1861-8219"],"issn-type":[{"value":"1861-8200","type":"print"},{"value":"1861-8219","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,6,15]]},"assertion":[{"value":"4 April 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 May 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 June 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"112"}}