{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T17:47:49Z","timestamp":1757612869394,"version":"3.44.0"},"reference-count":83,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T00:00:00Z","timestamp":1745193600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T00:00:00Z","timestamp":1745193600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"the Natural Science Foundation of Hubei Province","award":["2023AFB206"],"award-info":[{"award-number":["2023AFB206"]}]},{"name":"the Scientific Research Foundation of Hubei University of Education for Talent Introduction","award":["ESRC20230009"],"award-info":[{"award-number":["ESRC20230009"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["WHUTIOT2023-006"],"award-info":[{"award-number":["WHUTIOT2023-006"]}]},{"name":"the Hubei Institute of Education Science","award":["2022ZA41"],"award-info":[{"award-number":["2022ZA41"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s00530-025-01787-5","type":"journal-article","created":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T11:33:42Z","timestamp":1745235222000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Refined linguistic deliberation for video captioning via cascade transformer and LSTM"],"prefix":"10.1007","volume":"31","author":[{"given":"Shuqin","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhixin","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yikang","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shifeng","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,21]]},"reference":[{"key":"1787_CR1","doi-asserted-by":"crossref","unstructured":"Aafaq, N., Akhtar, N., Liu, W., et\u00a0al.: Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 12487\u201312496 (2019)","DOI":"10.1109\/CVPR.2019.01277"},{"key":"1787_CR2","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL Workshops, pp. 65\u201372 (2005)"},{"key":"1787_CR3","unstructured":"Bengio, S., Vinyals, O., Jaitly, N., et\u00a0al.: Scheduled sampling for sequence prediction with recurrent neural networks. Adv. Neural Inf. Process. Syst. 28 (2015)"},{"issue":"1","key":"1787_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TPAMI.2021.3137605","volume":"45","author":"X Chang","year":"2023","unstructured":"Chang, X., Ren, P., Xu, P., et al.: A comprehensive survey of scene graphs: generation and application. IEEE Trans. Pattern Anal. Mach. Intell. 45(1), 1\u201326 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2021.3137605","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1787_CR5","doi-asserted-by":"crossref","unstructured":"Chen, C., Ye, M., Qi, M., et\u00a0al.: Sketchtrans: disentangled prototype learning with transformer for sketch-photo recognition. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3337005"},{"key":"1787_CR6","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103878","volume":"238","author":"J Chen","year":"2024","unstructured":"Chen, J.: Transform, contrast and tell: coherent entity-aware multi-image captioning. Comput. Vis. Image Underst. 238, 103878 (2024)","journal-title":"Comput. Vis. Image Underst."},{"key":"1787_CR7","doi-asserted-by":"crossref","unstructured":"Chen, J., Chao, H.: VideoTRM: pre-training for video captioning challenge 2020. In: ACM MM, pp. 4605\u20134609 (2020)","DOI":"10.1145\/3394171.3416291"},{"key":"1787_CR8","doi-asserted-by":"crossref","unstructured":"Chen, J., Pan, Y., Li, Y., et\u00a0al.: Temporal deformable convolutional encoder-decoder networks for video captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 8167\u20138174 (2019)","DOI":"10.1609\/aaai.v33i01.33018167"},{"issue":"1s","key":"1787_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3539225","volume":"19","author":"J Chen","year":"2023","unstructured":"Chen, J., Pan, Y., Li, Y., et al.: Retrieval augmented convolutional encoder-decoder networks for video captioning. ACM Trans. Multimed. Comput. Commun. Appl. 19(1s), 1\u201324 (2023)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"1787_CR10","unstructured":"Chen, M., Li, Y., Zhang, Z., et\u00a0al.: Tvt: two-view transformer network for video captioning. In: Asian Conference on Machine Learning, PMLR, pp. 847\u2013862 (2018)"},{"key":"1787_CR11","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, Y.: Motion guided spatial attention for video captioning. In: AAAI, pp. 8191\u20138198 (2019)","DOI":"10.1609\/aaai.v33i01.33018191"},{"key":"1787_CR12","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, Y.G.: Motion guided region message passing for video captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1543\u20131552 (2021)","DOI":"10.1109\/ICCV48922.2021.00157"},{"key":"1787_CR13","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, W., Liu, W., et\u00a0al.: Learning modality interaction for temporal sentence localization and event captioning in videos. In: ECCV, pp. 333\u2013351 (2020)","DOI":"10.1007\/978-3-030-58548-8_20"},{"issue":"3","key":"1787_CR14","doi-asserted-by":"publisher","first-page":"2353","DOI":"10.1007\/s11063-020-10352-2","volume":"52","author":"S Chen","year":"2020","unstructured":"Chen, S., Zhong, X., Li, L., et al.: Adaptively converting auxiliary attributes and textual embedding for video captioning based on bilstm. Neural Process. Lett. 52(3), 2353\u20132369 (2020)","journal-title":"Neural Process. Lett."},{"issue":"10","key":"1787_CR15","doi-asserted-by":"publisher","first-page":"326:1","DOI":"10.1145\/3679203","volume":"20","author":"S Chen","year":"2024","unstructured":"Chen, S., Zhong, X., Zhang, Y., et al.: Action-aware linguistic skeleton optimization network for non-autoregressive video captioning. ACM Trans. Multimedia Comput. Commun. Appl. 20(10), 326:1-326:24 (2024)","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl."},{"key":"1787_CR16","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Gan, Z., Cheng, Y., et\u00a0al.: Distilling knowledge learned in bert for text generation. arXiv preprint arXiv:1911.03829 (2019)","DOI":"10.18653\/v1\/2020.acl-main.705"},{"key":"1787_CR17","doi-asserted-by":"crossref","unstructured":"Cherian, A., Wang, J., Hori, C., et\u00a0al.: Spatio-temporal ranked-attention networks for video captioning. In: WACV, pp. 1606\u20131615 (2020)","DOI":"10.1109\/WACV45572.2020.9093291"},{"issue":"2","key":"1787_CR18","doi-asserted-by":"publisher","first-page":"880","DOI":"10.1109\/TCSVT.2021.3063423","volume":"32","author":"J Deng","year":"2021","unstructured":"Deng, J., Li, L., Zhang, B., et al.: Syntax-guided hierarchical attention network for video captioning. IEEE Trans. Circuits Syst. Video Technol. 32(2), 880\u2013892 (2021)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"2","key":"1787_CR19","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3550276","volume":"19","author":"S Dong","year":"2023","unstructured":"Dong, S., Niu, T., Luo, X., et al.: Semantic embedding guided attention with explicit visual feature fusion for video captioning. ACM Trans. Multimed. Comput. Commun. Appl. 19(2), 1\u201318 (2023)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"1787_CR20","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2022.103617","volume":"228","author":"Y Duan","year":"2023","unstructured":"Duan, Y., Wang, Z., Li, Y., et al.: Cross-domain multi-style merge for image captioning. Comput. Vis. Image Underst. 228, 103617 (2023)","journal-title":"Comput. Vis. Image Underst."},{"issue":"5","key":"1787_CR21","first-page":"1112","volume":"42","author":"L Gao","year":"2020","unstructured":"Gao, L., Li, X., Song, J., et al.: Hierarchical lstms with adaptive attention for visual captioning. IEEE Trans. Pattern Anal. Mach. Intell. 42(5), 1112\u20131131 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1787_CR22","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1109\/TIP.2021.3120867","volume":"31","author":"L Gao","year":"2021","unstructured":"Gao, L., Lei, Y., Zeng, P., et al.: Hierarchical representation network with auxiliary tasks for video captioning and video question answering. IEEE Trans. Image Process. 31, 202\u2013215 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"1787_CR23","doi-asserted-by":"crossref","unstructured":"Gu, X., Chen, G., Wang, Y., et\u00a0al.: Text with knowledge graph augmented transformer for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18941\u201318951 (2023)","DOI":"10.1109\/CVPR52729.2023.01816"},{"key":"1787_CR24","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., et\u00a0al.: YouTube2Text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: CVPR, pp. 2712\u20132719 (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"1787_CR25","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et\u00a0al.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"1787_CR26","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"1787_CR27","doi-asserted-by":"crossref","unstructured":"Jin, T., Huang, S., Chen, M., et\u00a0al.: SBAT: video captioning with sparse boundary-aware transformer. In: IJCAI, pp. 3119\u20133127 (2020)","DOI":"10.24963\/ijcai.2020\/88"},{"key":"1787_CR28","doi-asserted-by":"publisher","first-page":"2367","DOI":"10.1109\/TMM.2023.3295098","volume":"26","author":"S Jing","year":"2024","unstructured":"Jing, S., Zhang, H., Zeng, P., et al.: Memory-based augmentation network for video captioning. IEEE Trans. Multimedia 26, 2367\u20132379 (2024)","journal-title":"IEEE Trans. Multimedia"},{"issue":"1","key":"1787_CR29","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1109\/TCSVT.2020.3045735","volume":"32","author":"L Li","year":"2020","unstructured":"Li, L., Zhang, Y., Tang, S., et al.: Adaptive spatial location with balanced loss for video captioning. IEEE Trans. Circuits Syst. Video Technol. 32(1), 17\u201330 (2020)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1787_CR30","doi-asserted-by":"publisher","first-page":"2726","DOI":"10.1109\/TIP.2022.3158546","volume":"31","author":"L Li","year":"2022","unstructured":"Li, L., Gao, X., Deng, J., et al.: Long short-term relation transformer with global gating for video captioning. IEEE Trans. Image Process. 31, 2726\u20132738 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"1787_CR31","doi-asserted-by":"publisher","unstructured":"Lian, R., Ling, H.: Checkerpose: progressive dense keypoint localization for object pose estimation with graph neural network. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1\u20136, 2023. IEEE, pp. 13976\u201313987. https:\/\/doi.org\/10.1109\/ICCV51070.2023.01289, https:\/\/doi.org\/10.1109\/ICCV51070.2023.01289 (2023)","DOI":"10.1109\/ICCV51070.2023.01289"},{"key":"1787_CR32","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: ACL (2004)"},{"key":"1787_CR33","doi-asserted-by":"crossref","unstructured":"Liu, L., Utiyama, M., Finch, A., et\u00a0al.: Agreement on target-bidirectional neural machine translation. In: Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 411\u2013416 (2016)","DOI":"10.18653\/v1\/N16-1046"},{"issue":"9","key":"1787_CR34","doi-asserted-by":"publisher","first-page":"3259","DOI":"10.1109\/TPAMI.2019.2940007","volume":"43","author":"S Liu","year":"2021","unstructured":"Liu, S., Ren, Z., Yuan, J.: SibNet: sibling convolutional encoder for video captioning. IEEE Trans. Pattern Anal. Mach. Intell. 43(9), 3259\u20133272 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1787_CR35","doi-asserted-by":"publisher","first-page":"4213","DOI":"10.1109\/TMM.2022.3172548","volume":"25","author":"S Liu","year":"2023","unstructured":"Liu, S., Bao, R., Zhu, D., et al.: Fine-grained face editing via personalized spatial-aware affine modulation. IEEE Trans. Multimedia 25, 4213\u20134224 (2023). https:\/\/doi.org\/10.1109\/TMM.2022.3172548","journal-title":"IEEE Trans. Multimedia"},{"key":"1787_CR36","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2022.103453","volume":"221","author":"H Munusamy","year":"2022","unstructured":"Munusamy, H., et al.: Video captioning using semantically contextual generative adversarial network. Comput. Vis. Image Underst. 221, 103453 (2022)","journal-title":"Comput. Vis. Image Underst."},{"key":"1787_CR37","doi-asserted-by":"crossref","unstructured":"Pan, B., Cai, H., Huang, D., et\u00a0al.: Spatio-temporal graph for video captioning with knowledge distillation. In: CVPR, pp. 10867\u201310876 (2020)","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"1787_CR38","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T.: Bleu: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1787_CR39","doi-asserted-by":"crossref","unstructured":"Pasunuru, R., Bansal, M.: Reinforced video captioning with entailment rewards. In: EMNLP, pp. 979\u2013985 (2017)","DOI":"10.18653\/v1\/D17-1103"},{"key":"1787_CR40","doi-asserted-by":"crossref","unstructured":"Pei, W., Zhang, J., Wang, X., et\u00a0al.: Memory-attended recurrent network for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8347\u20138356 (2019)","DOI":"10.1109\/CVPR.2019.00854"},{"key":"1787_CR41","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103864","volume":"238","author":"Q Rao","year":"2024","unstructured":"Rao, Q., Yu, X., Li, G., et al.: Cmgnet: Collaborative multi-modal graph network for video captioning. Comput. Vis. Image Underst. 238, 103864 (2024)","journal-title":"Comput. Vis. Image Underst."},{"key":"1787_CR42","doi-asserted-by":"crossref","unstructured":"Ryu, H., Kang, S., Kang, H., et\u00a0al.: Semantic grouping network for video captioning. In: AAAI, pp. 2514\u20132522 (2021)","DOI":"10.1609\/aaai.v35i3.16353"},{"key":"1787_CR43","doi-asserted-by":"crossref","unstructured":"Shi, X., Cai, J., Joty, S., et\u00a0al.: Watch it twice: Video captioning with a refocused video encoder. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 818\u2013826 (2019)","DOI":"10.1145\/3343031.3351060"},{"key":"1787_CR44","doi-asserted-by":"crossref","unstructured":"Song, P., Guo, D., Yang, X., et\u00a0al.: Emotion-prior awareness network for emotional video captioning. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 589\u2013600 (2023)","DOI":"10.1145\/3581783.3611726"},{"key":"1787_CR45","doi-asserted-by":"crossref","unstructured":"Song, P., Guo, D., Yang, X., et\u00a0al.: Emotional video captioning with vision-based emotion interpretation network. IEEE Trans. Image Process. (2024)","DOI":"10.1109\/TIP.2024.3359045"},{"issue":"6","key":"1787_CR46","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3682067","volume":"15","author":"P Song","year":"2024","unstructured":"Song, P., Zhou, Y., Yang, X., et al.: Efficiently gluing pre-trained language and vision models for image captioning. ACM Trans. Intell. Syst. Technol. 15(6), 1\u201316 (2024)","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"1787_CR47","unstructured":"Srivastava, R.K., Greff, K., Schmidhuber, J.: Training very deep networks. In: NeurIPS, pp. 2377\u20132385 (2015)"},{"key":"1787_CR48","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Ioffe, S., Vanhoucke, V., et\u00a0al.: Inception-v4, inception-resnet and the impact of residual connections on learning. In: Proceedings of the AAAI Conference on Artificial Intelligence (2017)","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"1787_CR49","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., et\u00a0al.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"1787_CR50","doi-asserted-by":"crossref","unstructured":"Tu, Y., Zhang, X., Liu, B., et\u00a0al.: Video description with spatial-temporal attention. In: Proceedings of the 25th ACM International Conference on Multimedia, pp. 1014\u20131022 (2017)","DOI":"10.1145\/3123266.3123354"},{"key":"1787_CR51","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107702","volume":"111","author":"Y Tu","year":"2021","unstructured":"Tu, Y., Zhou, C., Guo, J., et al.: Enhancing the alignment between target words and corresponding frames for video captioning. Pattern Recogn. 111, 107702 (2021)","journal-title":"Pattern Recogn."},{"key":"1787_CR52","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et\u00a0al.: Attention is all you need. In: NeurIPS, pp. 5998\u20136008"},{"key":"1787_CR53","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: Consensus-based image description evaluation. In: CVPR, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1787_CR54","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., et\u00a0al.: Translating videos to natural language using deep recurrent neural networks. arXiv preprint arXiv:1412.4729 (2014)","DOI":"10.3115\/v1\/N15-1173"},{"key":"1787_CR55","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., et\u00a0al.: Sequence to sequence-video to text. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"1787_CR56","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., et\u00a0al.: Controllable video captioning with POS sequence guidance based on gated fusion network. In: Proc. IEEE\/CVF Int. Conf. Comput. Vis. (ICCV), pp. 2641\u20132650 (2019)","DOI":"10.1109\/ICCV.2019.00273"},{"key":"1787_CR57","doi-asserted-by":"crossref","unstructured":"Wang, C., Yang, H., Bartz, C., et\u00a0al.: Image captioning with deep bidirectional lstms. In: Proceedings of the 24th ACM International Conference on Multimedia, pp. 988\u2013997 (2016)","DOI":"10.1145\/2964284.2964299"},{"key":"1787_CR58","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103799","volume":"235","author":"H Wang","year":"2023","unstructured":"Wang, H., Zhang, L., Fan, H., et al.: Collaborative three-stream transformers for video captioning. Comput. Vis. Image Underst. 235, 103799 (2023)","journal-title":"Comput. Vis. Image Underst."},{"key":"1787_CR59","doi-asserted-by":"crossref","unstructured":"Wang, L., Bai, Z., Zhang, Y., et\u00a0al.: Show, recall, and tell: image captioning with recall mechanism. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 12176\u201312183 (2020)","DOI":"10.1609\/aaai.v34i07.6898"},{"key":"1787_CR60","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhang, R., Lu, Z., et\u00a0al.: End-to-end dense video captioning with parallel decoding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6847\u20136857 (2021)","DOI":"10.1109\/ICCV48922.2021.00677"},{"issue":"3","key":"1787_CR61","doi-asserted-by":"publisher","first-page":"1635","DOI":"10.1109\/TPAMI.2022.3168530","volume":"46","author":"W Wang","year":"2024","unstructured":"Wang, W., Sun, G., Gool, L.V.: Looking beyond single images for weakly supervised semantic segmentation learning. IEEE Trans. Pattern Anal. Mach. Intell. 46(3), 1635\u20131649 (2024). https:\/\/doi.org\/10.1109\/TPAMI.2022.3168530","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1787_CR62","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., et\u00a0al.: Vatex: a large-scale, high-quality multilingual dataset for video-and-language research. In: Proc. IEEE\/CVF Int. Conf. Comput. Vis., pp. 4580\u20134590 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"issue":"10","key":"1787_CR63","doi-asserted-by":"publisher","first-page":"6753","DOI":"10.1109\/TCSVT.2022.3169894","volume":"32","author":"B Wu","year":"2022","unstructured":"Wu, B., Niu, G., Yu, J., et al.: Towards knowledge-aware video captioning via transitive visual relationship detection. IEEE Trans. Circuits Syst. Video Technol. 32(10), 6753\u20136765 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1787_CR64","unstructured":"Xia, Y., Tian, F., Wu, L., et\u00a0al.: Deliberation networks: sequence generation beyond one-pass decoding. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"1787_CR65","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R.B., Doll\u00e1r, P., et\u00a0al.: Aggregated residual transformations for deep neural networks. In: CVPR, pp. 5987\u20135995 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"1787_CR66","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., et\u00a0al.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"1787_CR67","unstructured":"Yang, A., Nagrani, A., Laptev, I., et\u00a0al.: Vidchapters-7m: video chapters at scale. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"1787_CR68","doi-asserted-by":"crossref","unstructured":"Yang, B., Zou, Y., Liu, F., et\u00a0al.: Non-autoregressive coarse-to-fine video captioning. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i4.16421"},{"key":"1787_CR69","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., et\u00a0al.: Describing videos by exploiting temporal structure. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"1787_CR70","doi-asserted-by":"crossref","unstructured":"Ye, H., Li, G., Qi, Y., et\u00a0al.: Hierarchical modular network for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17939\u201317948 (2022)","DOI":"10.1109\/CVPR52688.2022.01741"},{"issue":"2","key":"1787_CR71","doi-asserted-by":"publisher","first-page":"924","DOI":"10.1109\/TPAMI.2020.3013379","volume":"44","author":"M Ye","year":"2020","unstructured":"Ye, M., Shen, J., Zhang, X., et al.: Augmentation invariant and instance spreading feature for softmax embedding. IEEE Trans. Pattern Anal. Mach. Intell. 44(2), 924\u2013939 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1787_CR72","doi-asserted-by":"crossref","unstructured":"Ye, M., Wu, Z., Chen, C., et\u00a0al.: Channel augmentation for visible-infrared re-identification. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3332875"},{"key":"1787_CR73","doi-asserted-by":"publisher","first-page":"1775","DOI":"10.1109\/TMM.2021.3072479","volume":"24","author":"L Yu","year":"2022","unstructured":"Yu, L., Zhang, J., Wu, Q.: Dual attention on pyramid feature maps for image captioning. IEEE Trans. Multimedia 24, 1775\u20131786 (2022)","journal-title":"IEEE Trans. Multimedia"},{"issue":"12","key":"1787_CR74","doi-asserted-by":"publisher","first-page":"3088","DOI":"10.1109\/TPAMI.2019.2920899","volume":"42","author":"W Zhang","year":"2020","unstructured":"Zhang, W., Wang, B., Ma, L., et al.: Reconstruct and represent video contents for captioning via reinforcement learning. IEEE Trans. Pattern Anal. Mach. Intell. 42(12), 3088\u20133101 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1787_CR75","doi-asserted-by":"publisher","unstructured":"Zhang, Z., Crandall, D.: Hierarchically decoupled spatial-temporal contrast for self-supervised video representation learning. In: IEEE\/CVF Winter Conference on Applications of Computer Vision, WACV 2022, Waikoloa, HI, USA, January 3\u20138, 2022. IEEE, pp. 975\u2013985. https:\/\/doi.org\/10.1109\/WACV51458.2022.00105 (2022)","DOI":"10.1109\/WACV51458.2022.00105"},{"key":"1787_CR76","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Wu, S., Liu, S., et\u00a0al.: Regularizing neural machine translation by target-bidirectional agreement. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 443\u2013450 (2019)","DOI":"10.1609\/aaai.v33i01.3301443"},{"key":"1787_CR77","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Shi, Y., Yuan, C., et\u00a0al.: Object relational graph with teacher-recommended learning for video captioning. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR), pp. 13275\u201313285 (2020)","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"1787_CR78","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Qi, Z., Yuan, C., et\u00a0al.: Open-book video captioning with retrieve-copy-generate network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9837\u20139846 (2021)","DOI":"10.1109\/CVPR46437.2021.00971"},{"key":"1787_CR79","doi-asserted-by":"crossref","unstructured":"Zheng, Q., Wang, C., Tao, D.: Syntax-aware action targeting for video captioning. In: IJCAI, pp. 13093\u201313102 (2020)","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"1787_CR80","doi-asserted-by":"crossref","unstructured":"Zhong, X., Li, Z., Chen, S., et\u00a0al.: Refined semantic enhancement towards frequency diffusion for video captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 3724\u20133732 (2023)","DOI":"10.1609\/aaai.v37i3.25484"},{"key":"1787_CR81","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1162\/tacl_a_00256","volume":"7","author":"L Zhou","year":"2019","unstructured":"Zhou, L., Zhang, J., Zong, C.: Synchronous bidirectional neural machine translation. Trans. Assoc. Comput. Linguist. 7, 91\u2013105 (2019)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"1787_CR82","unstructured":"Zhou, Y., Hu, Z., Liu, D., et\u00a0al.: Compact bidirectional transformer for image captioning. arXiv preprint arXiv:2201.01984 (2022)"},{"key":"1787_CR83","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Jiang, S.: Attention-based densely connected LSTM for video captioning. In: Proc. ACM Int. Conf. Multimedia (MM), pp. 802\u2013810 (2019)","DOI":"10.1145\/3343031.3350932"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01787-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01787-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01787-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T15:02:45Z","timestamp":1756998165000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01787-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,21]]},"references-count":83,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["1787"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01787-5","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,4,21]]},"assertion":[{"value":"30 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 April 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This study did not involve any human or animal trials and the required permits and approvals have been obtained.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Research involving human and animal participants"}}],"article-number":"200"}}