{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,11,23]],"date-time":"2023-11-23T00:35:45Z","timestamp":1700699745457},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2023,8,25]],"date-time":"2023-08-25T00:00:00Z","timestamp":1692921600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,8,25]],"date-time":"2023-08-25T00:00:00Z","timestamp":1692921600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Scientific Research Foundation of Hubei University of Education for Talent Introduction","award":["ESRC20230009"],"award-info":[{"award-number":["ESRC20230009"]}]},{"name":"Hubei Institute of Education Science","award":["2022ZA41"],"award-info":[{"award-number":["2022ZA41"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s11063-023-11386-y","type":"journal-article","created":{"date-parts":[[2023,8,25]],"date-time":"2023-08-25T18:02:25Z","timestamp":1692986545000},"page":"11509-11526","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Video Captioning Based on Cascaded Attention-Guided Visual Feature Fusion"],"prefix":"10.1007","volume":"55","author":[{"given":"Shuqin","family":"Chen","sequence":"first","affiliation":[]},{"given":"Li","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Yikang","family":"Hu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,25]]},"reference":[{"issue":"11","key":"11386_CR1","first-page":"2286","volume":"41","author":"W Gao","year":"2020","unstructured":"Gao W, Chen LDZ et al (2020) Implementation of pre-standardized transformer in Ukrainian\u2013English machine translation. Small Microcomput Syst 41(11):2286\u20132291","journal-title":"Small Microcomput Syst"},{"issue":"11","key":"11386_CR2","first-page":"2300","volume":"42","author":"H Zhang","year":"2021","unstructured":"Zhang H, Shao YYD et al (2021) Neural machine translation based on hierarchical analysis of syntactic rules. Small Microcomput Syst 42(11):2300\u20132306","journal-title":"Small Microcomput Syst"},{"key":"11386_CR3","doi-asserted-by":"crossref","unstructured":"Fang K, Zhou L, Jin C et\u00a0al (2019) Fully convolutional video captioning with coarse-to-fine and inherited attention. In: proceedings of the AAAI conference on artificial intelligence, pp 8271\u20138278","DOI":"10.1609\/aaai.v33i01.33018271"},{"key":"11386_CR4","doi-asserted-by":"crossref","unstructured":"Lian Z, Li H, Wang R et\u00a0al (2020) Enhanced soft attention mechanism with an inception-like module for image captioning. In: 2020 IEEE 32nd international conference on tools with artificial intelligence (ICTAI), IEEE, pp 748\u2013752","DOI":"10.1109\/ICTAI50040.2020.00119"},{"issue":"05","key":"11386_CR5","first-page":"1064","volume":"40","author":"X Li","year":"2019","unstructured":"Li X, Zhang LXH (2019) Research on multi-theme image description generation method. Small Microcomput Syst 40(05):1064\u20131068","journal-title":"Small Microcomput Syst"},{"key":"11386_CR6","doi-asserted-by":"crossref","unstructured":"Wu X, Li G, Cao Q et\u00a0al (2018) Interpretable video captioning via trajectory structured localization. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6829\u20136837","DOI":"10.1109\/CVPR.2018.00714"},{"key":"11386_CR7","doi-asserted-by":"crossref","unstructured":"Liu S, Ren Z, Yuan J (2018) Sibnet: sibling convolutional encoder for video captioning. In: Proceedings of the 26th ACM international conference on multimedia, pp 1425\u20131434","DOI":"10.1145\/3240508.3240667"},{"key":"11386_CR8","doi-asserted-by":"crossref","unstructured":"Chen S, Jiang YG (2019) Motion guided spatial attention for video captioning. In: Proceedings of the AAAI conference on artificial intelligence, pp 8191\u20138198","DOI":"10.1609\/aaai.v33i01.33018191"},{"key":"11386_CR9","doi-asserted-by":"crossref","unstructured":"Cherian A, Wang J, Hori C et\u00a0al (2020) Spatio-temporal ranked-attention networks for video captioning. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 1617\u20131626","DOI":"10.1109\/WACV45572.2020.9093291"},{"key":"11386_CR10","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J et\u00a0al (2019) Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 4634\u20134643","DOI":"10.1109\/ICCV.2019.00473"},{"key":"11386_CR11","doi-asserted-by":"crossref","unstructured":"Chen S, Zhong X, Wu S et\u00a0al (2021) Memory-attended semantic context-aware network for video captioning. Soft Comput pp 1\u201313","DOI":"10.1007\/s00500-021-06360-6"},{"key":"11386_CR12","doi-asserted-by":"crossref","unstructured":"Sun Z, Zhong X, Chen S et\u00a0al (2021) Modeling context-guided visual and linguistic semantic feature for video captioning. In: International conference on artificial neural networks, Springer, pp 677\u2013689","DOI":"10.1007\/978-3-030-86383-8_54"},{"key":"11386_CR13","doi-asserted-by":"crossref","unstructured":"Zhang J, Peng Y (2019) Object-aware aggregation with bidirectional temporal graph for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8327\u20138336","DOI":"10.1109\/CVPR.2019.00852"},{"key":"11386_CR14","doi-asserted-by":"crossref","unstructured":"Hu Y, Chen Z, Zha ZJ et\u00a0al (2019) Hierarchical global-local temporal modeling for video captioning. In: Proceedings of the 27th ACM international conference on multimedia, pp 774\u2013783","DOI":"10.1145\/3343031.3351072"},{"key":"11386_CR15","doi-asserted-by":"crossref","unstructured":"Qin Y, Du J, Zhang Y et\u00a0al (2019) Look back and predict forward in image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8367\u20138375","DOI":"10.1109\/CVPR.2019.00856"},{"key":"11386_CR16","doi-asserted-by":"crossref","unstructured":"Shi J, Li Y, Wang S (2019) Cascade attention: multiple feature based learning for image captioning. In: 2019 IEEE international conference on image processing (ICIP), IEEE, pp 1970\u20131974","DOI":"10.1109\/ICIP.2019.8803149"},{"issue":"2","key":"11386_CR17","doi-asserted-by":"publisher","first-page":"880","DOI":"10.1109\/TCSVT.2021.3063423","volume":"32","author":"J Deng","year":"2021","unstructured":"Deng J, Li L, Zhang B et al (2021) Syntax-guided hierarchical attention network for video captioning. IEEE Trans Circuits Syst Video Technol 32(2):880\u2013892","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"11386_CR18","doi-asserted-by":"crossref","unstructured":"Zhu Y, Jiang S (2019) Attention-based densely connected lstm for video captioning. In: Proceedings of the 27th ACM international conference on multimedia, pp 802\u2013810","DOI":"10.1145\/3343031.3350932"},{"issue":"3","key":"11386_CR19","doi-asserted-by":"publisher","first-page":"2353","DOI":"10.1007\/s11063-020-10352-2","volume":"52","author":"S Chen","year":"2020","unstructured":"Chen S, Zhong X, Li L et al (2020) Adaptively converting auxiliary attributes and textual embedding for video captioning based on bilstm. Neural Process Lett 52(3):2353\u20132369","journal-title":"Neural Process Lett"},{"key":"11386_CR20","doi-asserted-by":"crossref","unstructured":"Pan B, Cai H, Huang DA, et\u00a0al (2020) Spatio-temporal graph for video captioning with knowledge distillation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10,870\u201310,879","DOI":"10.1109\/CVPR42600.2020.01088"},{"issue":"5","key":"11386_CR21","first-page":"1112","volume":"42","author":"L Gao","year":"2019","unstructured":"Gao L, Li X, Song J et al (2019) Hierarchical lstms with adaptive attention for visual captioning. IEEE Trans Pattern Anal Mach Intell 42(5):1112\u20131131","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"11386_CR22","doi-asserted-by":"crossref","unstructured":"Li X, Zhao B, Lu X, et\u00a0al (2017) Mam-RNN: multi-level attention model based RNN for video captioning. In: IJCAI, pp 2208\u20132214","DOI":"10.24963\/ijcai.2017\/307"},{"key":"11386_CR23","doi-asserted-by":"crossref","unstructured":"Wang H, Xu Y, Han Y (2018) Spotting and aggregating salient regions for video captioning. In: Proceedings of the 26th ACM international conference on multimedia, pp 1519\u20131526","DOI":"10.1145\/3240508.3240677"},{"issue":"10","key":"11386_CR24","doi-asserted-by":"publisher","first-page":"3981","DOI":"10.1109\/TCSVT.2020.3044887","volume":"31","author":"K Jiang","year":"2020","unstructured":"Jiang K, Wang Z, Yi P et al (2020) Decomposition makes better rain removal: an improved attention-guided de raining network. IEEE Trans Circuits Syst Video Technol 31(10):3981\u20133995","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"7","key":"11386_CR25","doi-asserted-by":"publisher","first-page":"2739","DOI":"10.1109\/TCSVT.2020.3031303","volume":"31","author":"Z Huang","year":"2020","unstructured":"Huang Z, Wang Z, Tsai CC et al (2020) Dotscn: group re-identification via domain-transferred single and couple representation learning. IEEE Trans Circuits Syst Video Technol 31(7):2739\u20132750","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"11386_CR26","doi-asserted-by":"crossref","unstructured":"Tan G, Liu D, Wang M et\u00a0al (2020) Learning to discretely compose reasoning module networks for video captioning. arXiv preprint arXiv:2007.09049","DOI":"10.24963\/ijcai.2020\/104"},{"key":"11386_CR27","doi-asserted-by":"crossref","unstructured":"Zheng Q, Wang C, Tao D (2020) Syntax-aware action targeting for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 13,096\u201313,105","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"11386_CR28","unstructured":"Xu K, Ba J, Kiros R et\u00a0al (2015) Show, attend and tell: neural image caption generation with visual attention. In: International conference on machine learning, PMLR, pp 2048\u20132057"},{"key":"11386_CR29","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li Y et\u00a0al (2020) X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10,971\u201310,980","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"11386_CR30","unstructured":"Ren S, He K, Girshick R et\u00a0al (2015) Faster R-CNN: towards real-time object detection with region proposal networks. Adv Neural Inf Process Syst 28"},{"key":"11386_CR31","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Rohrbach DJM et\u00a0al (2015) Sequence to sequence -video to text. In: Proceedings of IEEE international conference on computer vision, pp 4534\u20134542","DOI":"10.1109\/ICCV.2015.515"},{"key":"11386_CR32","doi-asserted-by":"crossref","unstructured":"Xu J, Mei T, Yao T et\u00a0al (2016) Msr-vtt: a large video description dataset for bridging video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5288\u20135296","DOI":"10.1109\/CVPR.2016.571"},{"key":"11386_CR33","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S et\u00a0al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"11386_CR34","doi-asserted-by":"crossref","unstructured":"Xie S, Girshick R, Doll\u00e1r P et\u00a0al (2017) Aggregated residual transformations for deep neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1492\u20131500","DOI":"10.1109\/CVPR.2017.634"},{"issue":"3","key":"11386_CR35","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky O, Deng J, Su H et al (2015) Imagenet large scale visual recognition challenge. Int J Comput Vision 115(3):211\u2013252","journal-title":"Int J Comput Vision"},{"key":"11386_CR36","unstructured":"Kay W, Carreira J, Simonyan K et\u00a0al (2017) The kinetics human action video dataset. arXiv preprint arXiv:1705.06950"},{"key":"11386_CR37","doi-asserted-by":"crossref","unstructured":"Chen J, Pan Y, Li Y et\u00a0al (2019) Temporal deformable convolutional encoder-decoder networks for video captioning. In: Proceedings of the AAAI conference on artificial intelligence, pp 8167\u20138174","DOI":"10.1609\/aaai.v33i01.33018167"},{"key":"11386_CR38","doi-asserted-by":"crossref","unstructured":"Chen Y, Wang S, Zhang W et\u00a0al (2018) Less is more: picking informative frames for video captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 358\u2013373","DOI":"10.1007\/978-3-030-01261-8_22"},{"key":"11386_CR39","doi-asserted-by":"crossref","unstructured":"Wang B, Ma L, Zhang W et\u00a0al (2018) Reconstruction network for video captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7622\u20137631","DOI":"10.1109\/CVPR.2018.00795"},{"key":"11386_CR40","doi-asserted-by":"crossref","unstructured":"Aafaq N, Akhtar N, Liu W et\u00a0al (2019) Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 12,487\u201312,496","DOI":"10.1109\/CVPR.2019.01277"},{"key":"11386_CR41","doi-asserted-by":"crossref","unstructured":"Shi X, Cai J, Joty S et\u00a0al (2019) Watch it twice: video captioning with a refocused video encoder. In: Proceedings of the 27th ACM international conference on multimedia, pp 818\u2013826","DOI":"10.1145\/3343031.3351060"},{"key":"11386_CR42","doi-asserted-by":"crossref","unstructured":"Wang B, Ma L, Zhang W et\u00a0al (2019) Controllable video captioning with pos sequence guidance based on gated fusion network. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2641\u20132650","DOI":"10.1109\/ICCV.2019.00273"},{"key":"11386_CR43","doi-asserted-by":"crossref","unstructured":"Hou J, Wu X, Zhao W et\u00a0al (2019) Joint syntax representation learning and visual cue translation for video captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 8918\u20138927","DOI":"10.1109\/ICCV.2019.00901"},{"key":"11386_CR44","doi-asserted-by":"crossref","unstructured":"Chen J, Pan Y, Li Y et\u00a0al (2019) Temporal deformable convolutional encoder-decoder networks for video captioning. In: Proceedings of the AAAI conference on artificial intelligence, pp 8167\u20138174","DOI":"10.1609\/aaai.v33i01.33018167"},{"key":"11386_CR45","doi-asserted-by":"publisher","first-page":"31,751","DOI":"10.1109\/ACCESS.2022.3160451","volume":"10","author":"S Li","year":"2022","unstructured":"Li S, Yang B, Zou Y (2022) Adaptive curriculum learning for video captioning. IEEE Access 10:31,751-31,759","journal-title":"IEEE Access"},{"issue":"1s","key":"11386_CR46","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3539225","volume":"19","author":"J Chen","year":"2023","unstructured":"Chen J, Pan Y, Li Y et al (2023) Retrieval augmented convolutional encoder-decoder networks for video captioning. ACM Trans Multimed Comput Commun Appl 19(1s):1\u201324","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"issue":"107","key":"11386_CR47","first-page":"702","volume":"111","author":"Y Tu","year":"2021","unstructured":"Tu Y, Zhou C, Guo J et al (2021) Enhancing the alignment between target words and corresponding frames for video captioning. Pattern Recogn 111(107):702","journal-title":"Pattern Recogn"},{"key":"11386_CR48","doi-asserted-by":"crossref","unstructured":"Ye H, Li G, Qi Y et\u00a0al (2022) Hierarchical modular network for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 17,939\u201317,948","DOI":"10.1109\/CVPR52688.2022.01741"},{"key":"11386_CR49","doi-asserted-by":"crossref","unstructured":"Pei W, Zhang J, Wang X et\u00a0al (2019) Memory-attended recurrent network for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8347\u20138356","DOI":"10.1109\/CVPR.2019.00854"},{"key":"11386_CR50","doi-asserted-by":"crossref","unstructured":"Chen J, Chao H (2020) Videotrm: pre-training for video captioning challenge 2020. In: Proceedings of the 28th ACM international conference on multimedia, pp 4605\u20134609","DOI":"10.1145\/3394171.3416291"},{"key":"11386_CR51","doi-asserted-by":"crossref","unstructured":"Liu S, Ren Z, Yuan J (2018) Sibnet: sibling convolutional encoder for video captioning. In: Proceedings of the 26th ACM international conference on multimedia, pp 1425\u20131434","DOI":"10.1145\/3240508.3240667"},{"key":"11386_CR52","doi-asserted-by":"crossref","unstructured":"Lin K, Li L, Lin CC et\u00a0al (2022) Swinbert: end-to-end transformers with sparse attention for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 17,949\u201317,958","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"11386_CR53","doi-asserted-by":"crossref","unstructured":"Li L, Gao X, Deng J et al (2022) Long short-term relation transformer with global gating for video captioning. IEEE Trans Image Process 31: 2726\u20132738","DOI":"10.1109\/TIP.2022.3158546"},{"key":"11386_CR54","doi-asserted-by":"crossref","unstructured":"Wang T, Zhang R, Lu Z et\u00a0al (2021) End-to-end dense video captioning with parallel decoding. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6847\u20136857","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"11386_CR55","doi-asserted-by":"crossref","unstructured":"Iashin V, Rahtu E (2020) Multi-modal dense video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops, pp 958\u2013959","DOI":"10.1109\/CVPRW50498.2020.00487"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11386-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-023-11386-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11386-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,22]],"date-time":"2023-11-22T05:19:17Z","timestamp":1700630357000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-023-11386-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,25]]},"references-count":55,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["11386"],"URL":"https:\/\/doi.org\/10.1007\/s11063-023-11386-y","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"value":"1370-4621","type":"print"},{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,8,25]]},"assertion":[{"value":"1 August 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 August 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This study did not involve any human or animal trials and the required permits and approvals have been obtained.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}}]}}