{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T03:07:10Z","timestamp":1774321630517,"version":"3.50.1"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,7,9]],"date-time":"2025-07-09T00:00:00Z","timestamp":1752019200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,9]],"date-time":"2025-07-09T00:00:00Z","timestamp":1752019200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Hefei Municipal Natural Science Foundation","award":["HZR2447"],"award-info":[{"award-number":["HZR2447"]}]},{"name":"Anhui Provincial Key Research and Development Program","award":["2022a05020042"],"award-info":[{"award-number":["2022a05020042"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62406095"],"award-info":[{"award-number":["62406095"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100003995","name":"Natural Science Foundation of Anhui Province","doi-asserted-by":"crossref","award":["2308085MF213"],"award-info":[{"award-number":["2308085MF213"]}],"id":[{"id":"10.13039\/501100003995","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Projects of Excellent Research and Innovation Teams in Anhui Province's Universities","award":["2022AH010095"],"award-info":[{"award-number":["2022AH010095"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00530-025-01911-5","type":"journal-article","created":{"date-parts":[[2025,7,10]],"date-time":"2025-07-10T09:44:44Z","timestamp":1752140684000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Visual-language collaborative multimodal transformer network for group activity detection in surveillance videos"],"prefix":"10.1007","volume":"31","author":[{"given":"Fudong","family":"Nian","sequence":"first","affiliation":[],"role":[{"role":"author","vocab":"crossref"}]},{"given":"Weijie","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocab":"crossref"}]},{"given":"Jun","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocab":"crossref"}]},{"given":"Chengqian","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocab":"crossref"}]},{"given":"Yun","family":"Fu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocab":"crossref"}]},{"given":"Zhize","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocab":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,7,9]]},"reference":[{"key":"1911_CR1","doi-asserted-by":"crossref","unstructured":"Azar, S.M., Atigh, M.G., Nickabadi, A., Alahi, A.: Convolutional relational machine for group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7892\u20137901 (2019)","DOI":"10.1109\/CVPR.2019.00808"},{"key":"1911_CR2","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Sanford, R., Javan, M., Snoek, C.G.: Actor-transformers for group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 839\u2013848 (2020)","DOI":"10.1109\/CVPR42600.2020.00092"},{"key":"1911_CR3","doi-asserted-by":"crossref","unstructured":"Kim, D., Lee, J., Cho, M., Kwak, S.: Detector-free weakly supervised group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20083\u201320093 (2022)","DOI":"10.1109\/CVPR52688.2022.01945"},{"key":"1911_CR4","doi-asserted-by":"crossref","unstructured":"Li, S., Cao, Q., Liu, L., Yang, K., Liu, S., Hou, J., Yi, S.: Groupformer: Group activity recognition with clustered spatial-temporal transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13668\u201313677 (2021)","DOI":"10.1109\/ICCV48922.2021.01341"},{"key":"1911_CR5","doi-asserted-by":"crossref","unstructured":"Li, Z., Chang, X., Li, Y., Su, J.: Skeleton-based group activity recognition via spatial-temporal panoramic graph. In: European Conference on Computer Vision, pp. 252\u2013269 (2024). Springer","DOI":"10.1007\/978-3-031-73202-7_15"},{"key":"1911_CR6","doi-asserted-by":"publisher","first-page":"1574","DOI":"10.1109\/TIP.2024.3362140","volume":"33","author":"Z Xie","year":"2024","unstructured":"Xie, Z., Jiao, C., Wu, K., Guo, D., Hong, R.: Active factor graph network for group activity recognition. IEEE Trans. Image Process. 33, 1574\u20131587 (2024)","journal-title":"IEEE Trans. Image Process."},{"key":"1911_CR7","doi-asserted-by":"crossref","unstructured":"Kim, D., Song, Y., Cho, M., Kwak, S.: Towards more practical group activity detection: a new benchmark and model. In: European Conference on Computer Vision, pp. 240\u2013258 (2024). Springer","DOI":"10.1007\/978-3-031-72970-6_14"},{"key":"1911_CR8","doi-asserted-by":"crossref","unstructured":"Ehsanpour, M., Abedin, A., Saleh, F., Shi, J., Reid, I., Rezatofighi, H.: Joint learning of social groups, individuals action and sub-group activities in videos. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IX 16, pp. 177\u2013195 (2020). Springer","DOI":"10.1007\/978-3-030-58545-7_11"},{"key":"1911_CR9","doi-asserted-by":"crossref","unstructured":"Ehsanpour, M., Saleh, F., Savarese, S., Reid, I., Rezatofighi, H.: Jrdb-act: a large-scale dataset for spatio-temporal action, social group and activity detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20983\u201320992 (2022)","DOI":"10.1109\/CVPR52688.2022.02031"},{"key":"1911_CR10","doi-asserted-by":"crossref","unstructured":"Tamura, M., Vishwakarma, R., Vennelakanti, R.: Hunting group clues with transformers for social group activity recognition. In: European Conference on Computer Vision, pp. 19\u201335 (2022). Springer","DOI":"10.1007\/978-3-031-19772-7_2"},{"issue":"1","key":"1911_CR11","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1109\/TNNLS.2020.2978386","volume":"32","author":"Z Wu","year":"2020","unstructured":"Wu, Z., Pan, S., Chen, F., Long, G., Zhang, C., Philip, S.Y.: A comprehensive survey on graph neural networks. IEEE Trans. Neural Netw. Learn. Syst. 32(1), 4\u201324 (2020)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"1911_CR12","first-page":"1","volume":"14","author":"A Ng","year":"2001","unstructured":"Ng, A., Jordan, M., Weiss, Y.: On spectral clustering: analysis and an algorithm. Adv. Neural Inf. Process. Syst. 14, 1\u20138 (2001)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1911_CR13","first-page":"1","volume":"17","author":"L Zelnik-Manor","year":"2004","unstructured":"Zelnik-Manor, L., Perona, P.: Self-tuning spectral clustering. Adv. Neural Inf. Process. Syst. 17, 1\u20138 (2004)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1911_CR14","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: deformable transformers for end-to-end object detection. arxiv 2020. arXiv preprint arXiv:2010.041593 (2010)"},{"key":"1911_CR15","first-page":"1","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30, 1\u201311 (2017)","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"6","key":"1911_CR16","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39(6), 1137\u20131149 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"1","key":"1911_CR17","doi-asserted-by":"publisher","first-page":"625","DOI":"10.1109\/TNNLS.2023.3331841","volume":"36","author":"M Wang","year":"2025","unstructured":"Wang, M., Xing, J., Mei, J., Liu, Y., Jiang, Y.: Actionclip: adapting language-image pretrained models for video action recognition. IEEE Trans. Neural Netw. Learn. Syst. 36(1), 625\u2013637 (2025)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"1911_CR18","doi-asserted-by":"publisher","first-page":"6386","DOI":"10.1109\/TMM.2024.3349923","volume":"26","author":"L Wu","year":"2024","unstructured":"Wu, L., Tian, M., Xiang, Y., Gu, K., Shi, G.: Learning label semantics for weakly supervised group activity recognition. IEEE Trans. Multimed. 26, 6386\u20136397 (2024)","journal-title":"IEEE Trans. Multimed."},{"key":"1911_CR19","unstructured":"Xu, G., Yin, J., Zhou, F., Dang, Y.: Activityclip: enhancing group activity recognition by mining complementary information from text to supplement image modality. arXiv preprint arXiv:2407.19820 (2024)"},{"key":"1911_CR20","unstructured":"Wang, Y., Li, X., Yan, Z., He, Y., Yu, J., Zeng, X., Wang, C., Ma, C., Huang, H., Gao, J., Dou, M., Chen, K., Wang, W., Qiao, Y., Wang, Y., Wang, L.: InternVideo2.5: empowering video MLLMs with long and rich context modeling (2025)"},{"key":"1911_CR21","unstructured":"Bai, S., Chen, K., Liu, X., Wang, J., Ge, W., Song, S., Dang, K., Wang, P., Wang, S., Tang, J., et al.: Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"1911_CR22","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.C., Lo, W.-Y., et al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"1911_CR23","first-page":"47908","volume":"37","author":"Q Zhao","year":"2024","unstructured":"Zhao, Q., Wang, Y., Xu, J., He, Y., Song, Z., Wang, L., Qiao, Y., Zhao, C.: Does video-text pretraining help open-vocabulary online action detection? Adv. Neural Inf. Process. Syst. 37, 47908\u201347930 (2024)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1911_CR24","first-page":"5517","volume":"38","author":"M Wang","year":"2024","unstructured":"Wang, M., Xing, J., Jiang, B., Chen, J., Mei, J., Zuo, X., Dai, G., Wang, J., Liu, Y.: A multimodal, multi-task adapting framework for video action recognition. Proc. AAAI Conf. Artif. Intell. 38, 5517\u20135525 (2024)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"1911_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111402","volume":"162","author":"Z Quan","year":"2025","unstructured":"Quan, Z., Chen, J., Deguchi, D., Sun, J., Zhang, C., Li, Y., Murase, H.: Semantic matters: a constrained approach for zero-shot video action recognition. Pattern Recogn. 162, 111402 (2025)","journal-title":"Pattern Recogn."},{"key":"1911_CR26","doi-asserted-by":"publisher","unstructured":"Xu, H., Gao, Y., Hui, Z., Li, J., Gao, X.: Language knowledge-assisted representation learning for skeleton-based action recognition. IEEE Trans. Multimed. 1\u201316 (2025). https:\/\/doi.org\/10.1109\/TMM.2025.3543034","DOI":"10.1109\/TMM.2025.3543034"},{"key":"1911_CR27","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128623","volume":"611","author":"S Zhu","year":"2025","unstructured":"Zhu, S., Sun, L., Ma, Z., Li, C., He, D.: Prompt-supervised dynamic attention graph convolutional network for skeleton-based action recognition. Neurocomputing 611, 128623 (2025)","journal-title":"Neurocomputing"},{"key":"1911_CR28","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (long and Short Papers), pp. 4171\u20134186 (2019)"},{"key":"1911_CR29","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"1911_CR30","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"1911_CR31","unstructured":"Ge, Z., Liu, S., Wang, F., Li, Z., Sun, J.: Yolox: exceeding yolo series in 2021. arXiv preprint arXiv:2107.08430 (2021)"},{"key":"1911_CR32","unstructured":"Shao, S., Zhao, Z., Li, B., Xiao, T., Yu, G., Zhang, X., Sun, J.: Crowdhuman: a benchmark for detecting human in a crowd. arXiv preprint arXiv:1805.00123 (2018)"},{"key":"1911_CR33","doi-asserted-by":"publisher","first-page":"845","DOI":"10.1007\/s11263-020-01393-0","volume":"129","author":"P Dendorfer","year":"2021","unstructured":"Dendorfer, P., Osep, A., Milan, A., Schindler, K., Cremers, D., Reid, I., Roth, S., Leal-Taix\u00e9, L.: Motchallenge: a benchmark for single-camera multiple target tracking. Int. J. Comput. Vis. 129, 845\u2013881 (2021)","journal-title":"Int. J. Comput. Vis."},{"key":"1911_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, S., Benenson, R., Schiele, B.: Citypersons: a diverse dataset for pedestrian detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3213\u20133221 (2017)","DOI":"10.1109\/CVPR.2017.474"},{"key":"1911_CR35","doi-asserted-by":"crossref","unstructured":"Ess, A., Leibe, B., Schindler, K., Van\u00a0Gool, L.: A mobile vision system for robust multi-person tracking. In: 2008 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20138 (2008). IEEE","DOI":"10.1109\/CVPR.2008.4587730"},{"key":"1911_CR36","doi-asserted-by":"crossref","unstructured":"Choi, W., Chao, Y.-W., Pantofaru, C., Savarese, S.: Discovering groups of people in images. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part IV 13, pp. 417\u2013433 (2014). Springer","DOI":"10.1007\/978-3-319-10593-2_28"},{"key":"1911_CR37","doi-asserted-by":"crossref","unstructured":"Wu, J., Wang, L., Wang, L., Guo, J., Wu, G.: Learning actor relation graphs for group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9964\u20139974 (2019)","DOI":"10.1109\/CVPR.2019.01020"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01911-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01911-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01911-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:04:41Z","timestamp":1757927081000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01911-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,9]]},"references-count":37,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["1911"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01911-5","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,9]]},"assertion":[{"value":"16 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"305"}}