{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T00:26:27Z","timestamp":1770078387081,"version":"3.49.0"},"reference-count":35,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2021,9,5]],"date-time":"2021-09-05T00:00:00Z","timestamp":1630800000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2021,9,5]],"date-time":"2021-09-05T00:00:00Z","timestamp":1630800000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100003787","name":"natural science foundation of hebei province","doi-asserted-by":"publisher","award":["F2021202038"],"award-info":[{"award-number":["F2021202038"]}],"id":[{"id":"10.13039\/501100003787","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2022,12]]},"DOI":"10.1007\/s00371-021-02294-0","type":"journal-article","created":{"date-parts":[[2021,9,5]],"date-time":"2021-09-05T03:14:41Z","timestamp":1630811681000},"page":"4267-4278","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Video captioning with global and local text attention"],"prefix":"10.1007","volume":"38","author":[{"given":"Yuqing","family":"Peng","sequence":"first","affiliation":[]},{"given":"Chenxi","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yixin","family":"Pei","sequence":"additional","affiliation":[]},{"given":"Yingjun","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,5]]},"reference":[{"key":"2294_CR1","doi-asserted-by":"crossref","unstructured":"Yu, H. , Wang, J. , Huang, Z. , Yang, Y. : Video paragraph captioning using hierarchical recurrent neural networks. In:IEEE Conference on Computer Vision and Pattern Recognition.IEEE,pp. 4584\u20134593 (2016)","DOI":"10.1109\/CVPR.2016.496"},{"key":"2294_CR2","doi-asserted-by":"crossref","unstructured":"Zanfir, M. , Marinoiu, E. , Sminchisescu, C.\u00a0:Spatio-temporal attention models for grounded video captioning.In:Asian Conference on Computer Vision. Springer\u00a0, pp.104\u2013119 (2016)","DOI":"10.1007\/978-3-319-54190-7_7"},{"issue":"3","key":"2294_CR3","doi-asserted-by":"publisher","first-page":"445","DOI":"10.1007\/s00371-018-1566-y","volume":"35","author":"X Liu","year":"2019","unstructured":"Liu, X., Xu, Q., Wang, N.: A survey on deep neural network-based image captioning. Vis. Comput. 35(3), 445\u2013470 (2019)","journal-title":"Vis. Comput."},{"key":"2294_CR4","unstructured":"Yao,L. ,\u00a0Torabi,\u00a0A.,\u00a0Cho,K.: Video caption Generation Incorporating Spatio - temporal Features and a Soft- attention Mechanism. In:IEEE Conference on Computer Vision (ICCV). Eprint Arxiv,\u00a053, pp.199\u2013211 (2015)"},{"key":"2294_CR5","doi-asserted-by":"crossref","unstructured":"Venugopalan, S. , Rohrbach, M. , Donahue, J.:Sequence to sequence-video to text.In:IEEE international Conference on Computer Vision. IEEE,pp.4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"2294_CR6","doi-asserted-by":"crossref","unstructured":"Zhang, J. , Peng,Y.: Object-Aware Aggregation With Bidirectional Temporal Graph for Video Captioning. CVPR ,pp. 8327\u20138336 (2019)","DOI":"10.1109\/CVPR.2019.00852"},{"key":"2294_CR7","doi-asserted-by":"crossref","unstructured":"Hori, C. , Hori, T. , Lee, T. Y. , Sumi, K.: Attention-based multimodal fusion for video caption.In: 2017 IEEE International Conference on Computer Vision (ICCV). IEEE ,pp.4203\u20134212 (2017)","DOI":"10.1109\/ICCV.2017.450"},{"key":"2294_CR8","doi-asserted-by":"crossref","unstructured":"Jin,T. ,Li,Y. ,Zhang,Z. : Recurrent convolutional video captioning with global and local attention. Neurocomputing 370, pp. 118\u2013127(2019)","DOI":"10.1016\/j.neucom.2019.08.042"},{"key":"2294_CR9","doi-asserted-by":"crossref","unstructured":"Jin, Q., Chen J., Chen S.: Describing videos using multi-modal fusion,\u00a0ACM Multimedia, pp.1087\u20131091. (2016)","DOI":"10.1145\/2964284.2984065"},{"key":"2294_CR10","doi-asserted-by":"crossref","unstructured":"Wang, H. , Gao, C. , Han, Y.\u00a0:Sequence in sequence for video captioning. Pattern Recognit,pp.327\u2013334 (2020)","DOI":"10.1016\/j.patrec.2018.07.024"},{"key":"2294_CR11","doi-asserted-by":"crossref","unstructured":"Pasunuru, R. , Bansal, M.: Multi-Task Video Captioning with Video and Entailment Generation. ACL ,pp. 1273\u20131283 (2017)","DOI":"10.18653\/v1\/D17-1103"},{"key":"2294_CR12","doi-asserted-by":"crossref","unstructured":"Pan, Y. , Yao, T. , Li, H.:Video Captioning with Transferred Semantic Attributes. CVPR,pp. 984\u2013992 (2017)","DOI":"10.1109\/CVPR.2017.111"},{"key":"2294_CR13","unstructured":"Lebret,R., O,P., Pinheiro, Collobert,R.: Phrase-based image captioning. ICML ,pp.2085\u20132094 (2015)"},{"key":"2294_CR14","doi-asserted-by":"crossref","unstructured":"Rohrbach, M. , Wei, Q. , Titov, I. Translating video content to natural language descriptions. ICCV,pp.433\u2013440 (2013)","DOI":"10.1109\/ICCV.2013.61"},{"key":"2294_CR15","unstructured":"Technicolor, T. , Related, S. , Technicolor, T. :ImageNet classification with deep convolutional neural networks[C]. In NIPS (2012)"},{"key":"2294_CR16","unstructured":"Russakovsky,O., Deng, J., Su,H.:ImageNet large scale visual recognition challenge. IJCV , pp.1\u201342 (2015)"},{"key":"2294_CR17","doi-asserted-by":"crossref","unstructured":"Zhu,Y. ,Liu,G. : Fine-grained action recognition using multi-view attentions. The Visual Computer, pp. 1771\u20131881 (2020)","DOI":"10.1007\/s00371-019-01770-y"},{"key":"2294_CR18","doi-asserted-by":"crossref","unstructured":"Venugopalan, S. , Xu, H. , Donahue, J.: Translating videos to natural language using deep recurrent neural networks. arXiv, pp.1494\u20131504 (2014)","DOI":"10.3115\/v1\/N15-1173"},{"key":"2294_CR19","doi-asserted-by":"crossref","unstructured":"Zheng, Q. , Wang, C. , Tao, D.\u00a0: Syntax-Aware Action Targeting for Video Captioning. CVPR ,pp.13093 -13102. (2020)","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"2294_CR20","unstructured":"Corpus English stop words.[EB\/OL](2012). https:\/\/blog.csdn.net\/weixin_30360497\/article\/details\/95088316"},{"key":"2294_CR21","doi-asserted-by":"crossref","unstructured":"Guadarrama,S. ,Krishnamoorthy,N. , Malkar-nenkar,G.: Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition,\u00a0ICCV, pp. 2712\u20132719. (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"2294_CR22","doi-asserted-by":"crossref","unstructured":"Xu,J. , Mei,T. ,Yao,T. ,Rui,Y.:Msr-vtt: A large video caption dataset for bridging video and language.CVPR , pp.5288\u20135296. (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"2294_CR23","unstructured":"Xu,K. , Ba,J. , Kiros, R. :Show attend and tell: Neural image caption generation with visual attention.ICML,pp. 2048\u20132057 (2015)"},{"key":"2294_CR24","doi-asserted-by":"crossref","unstructured":"Wang, B. , Ma, L. , Zhang, W. , Liu, W. :Reconstruction network for video captioning.CVPR, pp.7622\u20137631(2018)","DOI":"10.1109\/CVPR.2018.00795"},{"key":"2294_CR25","doi-asserted-by":"crossref","unstructured":"N Aafaq, N Akhtar, Liu, W. : Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning. CVPR, pp.12487 -12496 (2019)","DOI":"10.1109\/CVPR.2019.01277"},{"key":"2294_CR26","doi-asserted-by":"crossref","unstructured":"Xu,J. , Yao, T. ,Zhang,Y. ,Mei, T.:Learning multimodal attention lstm networks for video captioning .\u00a0ACM Multimedia , pp.537\u2013545 (2017)","DOI":"10.1145\/3123266.3123448"},{"key":"2294_CR27","doi-asserted-by":"crossref","unstructured":"Zheng, Q. , Wang, C. , Tao, D. : Syntax-Aware Action Targeting for Video Captioning. CVPR pp.13093\u201313102 (2020)","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"2294_CR28","doi-asserted-by":"crossref","unstructured":"Shen, Z. , Li, J. , Su, Z. :Weakly supervised dense video captioning.In:IEEE Conference on Computer Vision and Pattern Recognition. CVPR ,pp. 5159\u20135167 (2017)","DOI":"10.1109\/CVPR.2017.548"},{"key":"2294_CR29","doi-asserted-by":"crossref","unstructured":"Wang, B. , Ma, L. , Zhang, W. Controllable.: video captioning with pos sequence guidance based on gated fusion network.\u00a0ICCV ,pp.2641\u20132650 (2019)","DOI":"10.1109\/ICCV.2019.00273"},{"key":"2294_CR30","doi-asserted-by":"crossref","unstructured":"Cherian, A. , J Wang, Hori, C. : Spatio-Temporal Ranked-Attention Networks for Video Captioning. WACV ,pp.1606\u20131615 (2020)","DOI":"10.1109\/WACV45572.2020.9093291"},{"key":"2294_CR31","doi-asserted-by":"crossref","unstructured":"Zhang, J. ,Peng, Y. :Hierarchical vision- language alignment for video captioning. MMM, Springer, pp. 42\u201354 (2019)","DOI":"10.1007\/978-3-030-05710-7_4"},{"key":"2294_CR32","doi-asserted-by":"crossref","unstructured":"Gao,L. , Guo,Z. , Zhang,H. :Video captioning with attention -based LSTM and semantic consistency.IEEE Trans. Multimed,pp.2045\u20132055 (2017)","DOI":"10.1109\/TMM.2017.2729019"},{"key":"2294_CR33","doi-asserted-by":"crossref","unstructured":"Yu, H. , Wang, J. , Huang, Z. :Video paragraph captioning using hierarchical recurrent neural networks. CVPR, pp. 4584 -4593 (2016)","DOI":"10.1109\/CVPR.2016.496"},{"key":"2294_CR34","doi-asserted-by":"crossref","unstructured":"Xu,Y. ,Han,Y. ,Hong,R.\u00a0:Sequential video VLAD: training the aggregation locally and temporally.IEEE Trans. Image Process,pp.4933\u20134944. (2018)","DOI":"10.1109\/TIP.2018.2846664"},{"key":"2294_CR35","doi-asserted-by":"crossref","unstructured":"Song, J. , Guo, Y. , Gao, L.\u00a0:From deterministic to generative: multimodal stochastic RNNs for video captioning. IEEE Trans. Neural Netw. Learn. Syst, pp.3047\u20133058(2018)","DOI":"10.1109\/TNNLS.2018.2851077"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-021-02294-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-021-02294-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-021-02294-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,14]],"date-time":"2022-12-14T15:06:35Z","timestamp":1671030395000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-021-02294-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9,5]]},"references-count":35,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2022,12]]}},"alternative-id":["2294"],"URL":"https:\/\/doi.org\/10.1007\/s00371-021-02294-0","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,9,5]]},"assertion":[{"value":"25 August 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 September 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}