{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:48:20Z","timestamp":1778082500408,"version":"3.51.4"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T00:00:00Z","timestamp":1755734400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T00:00:00Z","timestamp":1755734400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.72171004"],"award-info":[{"award-number":["No.72171004"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.72301010"],"award-info":[{"award-number":["No.72301010"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Humanities and Social Science Project of Ministry of Education of China","award":["No. 21YJCZH186"],"award-info":[{"award-number":["No. 21YJCZH186"]}]},{"name":"Project of Cultivation for Young Top-notch Talents of Beijing Municipal Institutions","award":["No. BPHR202203061"],"award-info":[{"award-number":["No. BPHR202203061"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s00530-025-01909-z","type":"journal-article","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:05:19Z","timestamp":1755774319000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["STAGVid2C: enhancing video-based commonsense captioning with spatio-temporal action graph"],"prefix":"10.1007","volume":"31","author":[{"given":"Haitao","family":"Xiong","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junhong","family":"Ding","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuchen","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuanyuan","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,8,21]]},"reference":[{"key":"1909_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3355390","volume":"52","author":"N Aafaq","year":"2019","unstructured":"Aafaq, N., Mian, A., Liu, W., Gilani, S.Z., Shah, M.: Video description: A survey of methods, datasets, and evaluation metrics. ACM Computing Surveys (CSUR). 52, 1\u201337 (2019)","journal-title":"ACM Computing Surveys (CSUR)."},{"key":"1909_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s42979-021-00487-x","volume":"2","author":"S Islam","year":"2021","unstructured":"Islam, S., Dash, A., Seum, A., Raj, A.H., Hossain, T., Shah, F.M.: Exploring video captioning techniques: A comprehensive survey on deep learning methods. SN Computer Science. 2, 1\u201328 (2021)","journal-title":"SN Computer Science."},{"key":"1909_CR3","doi-asserted-by":"crossref","unstructured":"Kehkashan, T., Alsaeedi, A., Yafooz, W.M., Ismail, N.A., Al-Dhaqm, A.: Combinatorial Analysis of Deep Learning and Machine Learning Video Captioning Studies: A Systematic Literature Review. IEEE Access. (2024)","DOI":"10.1109\/ACCESS.2024.3357980"},{"key":"1909_CR4","doi-asserted-by":"crossref","unstructured":"Fang, Z., Gokhale, T., Banerjee, P., Baral, C., Yang, Y.: Video2commonsense: Generating commonsense descriptions to enrich video captioning. arXiv preprint arXiv:2003.05162. (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.61"},{"key":"1909_CR5","doi-asserted-by":"crossref","unstructured":"Yu, W., Liang, J., Ji, L., Li, L., Fang, Y., Xiao, N., Duan, N.: Hybrid reasoning network for video-based commonsense captioning. In: Proceedings of the 29th ACM international conference on multimedia. pp. 5213\u20135221 (2021)","DOI":"10.1145\/3474085.3475638"},{"key":"1909_CR6","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems. 27, (2014)"},{"key":"1909_CR7","doi-asserted-by":"publisher","first-page":"1414","DOI":"10.1109\/TPAMI.2013.244","volume":"36","author":"D Gong","year":"2013","unstructured":"Gong, D., Medioni, G., Zhao, X.: Structured time series analysis for human action segmentation and recognition. IEEE Trans. Pattern Anal. Mach. Intell. 36, 1414\u20131427 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1909_CR8","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1909_CR9","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure. In: Proceedings of the IEEE international conference on computer vision. pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"1909_CR10","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos Niebles, J.: Dense-captioning events in videos. In: Proceedings of the IEEE international conference on computer vision. pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"1909_CR11","doi-asserted-by":"crossref","unstructured":"Yang, A., Nagrani, A., Seo, P.H., Miech, A., Pont-Tuset, J., Laptev, I., Sivic, J., Schmid, C.: Vid2seq: Large-scale pretraining of a visual language model for dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10714\u201310726 (2023)","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"1909_CR12","doi-asserted-by":"crossref","unstructured":"Tan, G., Liu, D., Wang, M., Zha, Z.-J.: Learning to discretely compose reasoning module networks for video captioning. In: Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence. , Yokohama, Yokohama, Japan (2021)","DOI":"10.24963\/ijcai.2020\/104"},{"key":"1909_CR13","doi-asserted-by":"crossref","unstructured":"Shao, H., Fang, Z., Yang, Y.: CAVAN: Commonsense knowledge anchored video captioning. In: 2022 26th International Conference on Pattern Recognition (ICPR). pp. 4095\u20134102. IEEE (2022)","DOI":"10.1109\/ICPR56361.2022.9956241"},{"key":"1909_CR14","doi-asserted-by":"publisher","first-page":"1124369","DOI":"10.3389\/fpsyg.2023.1124369","volume":"14","author":"H Xiong","year":"2023","unstructured":"Xiong, H., Zhou, Y., Liu, J., Cai, Y.: Class-dependent and cross-modal memory network considering sentimental features for video-based captioning. Front. Psychol. 14, 1124369 (2023)","journal-title":"Front. Psychol."},{"key":"1909_CR15","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s11633-022-1369-5","volume":"20","author":"F-L Chen","year":"2023","unstructured":"Chen, F.-L., Zhang, D.-Z., Han, M.-L., Chen, X.-Y., Shi, J., Xu, S., Xu, B.: Vlp: A survey on vision-language pre-training. Mach Intell Res 20, 38\u201356 (2023)","journal-title":"Mach Intell Res"},{"key":"1909_CR16","doi-asserted-by":"crossref","unstructured":"Dou, Z.-Y., Xu, Y., Gan, Z., Wang, J., Wang, S., Wang, L., Zhu, C., Zhang, P., Yuan, L., Peng, N., others: An empirical study of training end-to-end vision-and-language transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18166\u201318176 (2022)","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"1909_CR17","unstructured":"Huang, Z., Zeng, Z., Liu, B., Fu, D., Fu, J.: Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849. (2020)"},{"key":"1909_CR18","doi-asserted-by":"crossref","unstructured":"Huang, Z., Zeng, Z., Huang, Y., Liu, B., Fu, D., Fu, J.: Seeing out of the box: End-to-end pre-training for vision-language representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 12976\u201312985 (2021)","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"1909_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, H., Misra, I., Rohrbach, M., Learned-Miller, E., Chen, X.: In defense of grid features for visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 10267\u201310276 (2020)","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"1909_CR20","unstructured":"Shen, S., Li, L.H., Tan, H., Bansal, M., Rohrbach, A., Chang, K.-W., Yao, Z., Keutzer, K.: How much can clip benefit vision-and-language tasks? arXiv preprint arXiv:2107.06383. (2021)"},{"key":"1909_CR21","doi-asserted-by":"crossref","unstructured":"Wu, D., Li, H., Gu, C., Guo, L., Liu, H.: Improving fusion of region features and grid features via two-step interaction for image-text retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia. pp. 5055\u20135064 (2022)","DOI":"10.1145\/3503161.3548223"},{"key":"1909_CR22","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: Virtex: Learning visual representations from textual annotations. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 11162\u201311173 (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"1909_CR23","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, J., Lee, J.-Y., Kweon, I.S.: Cbam: Convolutional block attention module. In: Proceedings of the European conference on computer vision (ECCV). pp. 3\u201319 (2018)","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"1909_CR24","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, W., Hu, X., Yang, J.: Selective kernel networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 510\u2013519 (2019)","DOI":"10.1109\/CVPR.2019.00060"},{"key":"1909_CR25","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: Vision-and-language transformer without convolution or region supervision. In: International conference on machine learning. pp. 5583\u20135594. PMLR (2021)"},{"key":"1909_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109659","volume":"141","author":"BJ Kim","year":"2023","unstructured":"Kim, B.J., Choi, H., Jang, H., Lee, D.G., Jeong, W., Kim, S.W.: Improved robustness of vision transformers via prelayernorm in patch embedding. Pattern Recogn. 141, 109659 (2023)","journal-title":"Pattern Recogn."},{"key":"1909_CR27","doi-asserted-by":"crossref","unstructured":"Ma, J., Bai, Y., Zhong, B., Zhang, W., Yao, T., Mei, T.: Visualizing and understanding patch interactions in vision transformer. IEEE Transactions on Neural Networks and Learning Systems. (2023)","DOI":"10.1109\/TNNLS.2023.3270479"},{"key":"1909_CR28","doi-asserted-by":"crossref","unstructured":"Song, Y., Shao, X., Chen, K., Zhang, W., Jing, Z., Li, M.: Clipvg: Text-guided image manipulation using differentiable vector graphics. In: Proceedings of the AAAI Conference on Artificial Intelligence. pp. 2312\u20132320 (2023)","DOI":"10.1609\/aaai.v37i2.25326"},{"key":"1909_CR29","doi-asserted-by":"crossref","unstructured":"Zhao, Z., Xu, X., Li, S., Plaza, A.: Hyperspectral Image Classification Using Groupwise Separable Convolutional Vision Transformer Network. IEEE Transactions on Geoscience and Remote Sensing. (2024)","DOI":"10.1109\/TGRS.2024.3377610"},{"key":"1909_CR30","doi-asserted-by":"publisher","first-page":"294","DOI":"10.1016\/j.neucom.2020.12.137","volume":"472","author":"P Li","year":"2022","unstructured":"Li, P., Zhang, P., Xu, X.: Graph convolutional network meta-learning with multi-granularity POS guidance for video captioning. Neurocomputing 472, 294\u2013305 (2022)","journal-title":"Neurocomputing"},{"key":"1909_CR31","doi-asserted-by":"crossref","unstructured":"Lyu, C., Li, W., Ji, T., Wang, L., Zhou, L., Gurrin, C., Yang, L., Yu, Y., Graham, Y., Foster, J.: Graph-Based Video-Language Learning with Multi-Grained Audio-Visual Alignment. In: Proceedings of the 31st ACM International Conference on Multimedia. pp. 3975\u20133984 (2023)","DOI":"10.1145\/3581783.3612132"},{"key":"1909_CR32","doi-asserted-by":"publisher","DOI":"10.1016\/j.compeleceng.2023.108641","volume":"107","author":"W Shi","year":"2023","unstructured":"Shi, W., Wang, H., Lou, X.: Multi-modal graph reasoning for structured video text extraction. Comput. Electr. Eng. 107, 108641 (2023)","journal-title":"Comput. Electr. Eng."},{"key":"1909_CR33","doi-asserted-by":"publisher","first-page":"6209","DOI":"10.1109\/TIP.2020.2988435","volume":"29","author":"J Zhang","year":"2020","unstructured":"Zhang, J., Peng, Y.: Video captioning with object-aware spatio-temporal correlation and aggregation. IEEE Trans. Image Process. 29, 6209\u20136222 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"1909_CR34","doi-asserted-by":"crossref","unstructured":"Xiao, X., Zhang, Y., Feng, R., Zhang, T., Gao, S., Fan, W.: Video captioning with temporal and region graph convolution network. In: 2020 IEEE International Conference on Multimedia and Expo (ICME). pp. 1\u20136. IEEE (2020)","DOI":"10.1109\/ICME46284.2020.9102967"},{"key":"1909_CR35","doi-asserted-by":"publisher","first-page":"88","DOI":"10.1016\/j.neucom.2022.02.062","volume":"488","author":"L Ji","year":"2022","unstructured":"Ji, L., Tu, R., Lin, K., Wang, L., Duan, N.: Multimodal graph neural network for video procedural captioning. Neurocomputing 488, 88\u201396 (2022)","journal-title":"Neurocomputing"},{"key":"1909_CR36","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109204","volume":"136","author":"Y Tu","year":"2023","unstructured":"Tu, Y., Zhou, C., Guo, J., Li, H., Gao, S., Yu, Z.: Relation-aware attention for video captioning via graph learning. Pattern Recogn. 136, 109204 (2023)","journal-title":"Pattern Recogn."},{"key":"1909_CR37","doi-asserted-by":"crossref","unstructured":"Pan, B., Cai, H., Huang, D.-A., Lee, K.-H., Gaidon, A., Adeli, E., Niebles, J.C.: Spatio-temporal graph for video captioning with knowledge distillation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 10870\u201310879 (2020)","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"1909_CR38","doi-asserted-by":"publisher","first-page":"5150","DOI":"10.1109\/TIP.2022.3192709","volume":"31","author":"H Wang","year":"2022","unstructured":"Wang, H., Lin, G., Hoi, S.C., Miao, C.: Cross-modal graph with meta concepts for video captioning. IEEE Trans. Image Process. 31, 5150\u20135162 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"1909_CR39","doi-asserted-by":"crossref","unstructured":"Dong, X., Long, C., Xu, W., Xiao, C.: Dual graph convolutional networks with transformer and curriculum learning for image captioning. In: Proceedings of the 29th ACM International Conference on Multimedia. pp. 2615\u20132624 (2021)","DOI":"10.1145\/3474085.3475439"},{"key":"1909_CR40","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the Knowledge in a Neural Network, http:\/\/arxiv.org\/abs\/1503.02531, (2015)"},{"key":"1909_CR41","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics. pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1909_CR42","unstructured":"Banerjee, S., Lavie, A.: METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. pp. 65\u201372 (2005)"},{"key":"1909_CR43","unstructured":"Rouge, L.C.: A package for automatic evaluation of summaries. In: Proceedings of Workshop on Text Summarization of ACL, Spain (2004)"},{"key":"1909_CR44","unstructured":"Kingma, D.P., Ba, J.: Adam: A Method for Stochastic Optimization. CoRR. abs\/1412.6980, (2014)"},{"key":"1909_CR45","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence-video to text. In: Proceedings of the IEEE international conference on computer vision. pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"1909_CR46","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao, L., Guo, Z., Zhang, H., Xu, X., Shen, H.T.: Video captioning with attention-based LSTM and semantic consistency. IEEE Trans. Multimedia 19, 2045\u20132055 (2017)","journal-title":"IEEE Trans. Multimedia"},{"key":"1909_CR47","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J.J., Socher, R., Xiong, C.: End-to-end dense video captioning with masked transformer. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 8739\u20138748 (2018)","DOI":"10.1109\/CVPR.2018.00911"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01909-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01909-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01909-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T10:24:58Z","timestamp":1761387898000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01909-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,21]]},"references-count":47,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["1909"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01909-z","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8,21]]},"assertion":[{"value":"15 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"332"}}