{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T03:05:09Z","timestamp":1740107109406,"version":"3.37.3"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2022,1,13]],"date-time":"2022-01-13T00:00:00Z","timestamp":1642032000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,1,13]],"date-time":"2022-01-13T00:00:00Z","timestamp":1642032000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2023,1]]},"DOI":"10.1007\/s00371-021-02309-w","type":"journal-article","created":{"date-parts":[[2022,1,13]],"date-time":"2022-01-13T00:03:47Z","timestamp":1642032227000},"page":"9-25","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Cross-language multimodal scene semantic guidance and leap sampling for video captioning"],"prefix":"10.1007","volume":"39","author":[{"given":"Bo","family":"Sun","sequence":"first","affiliation":[]},{"given":"Yong","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Yijia","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Zhuo","family":"Hao","sequence":"additional","affiliation":[]},{"given":"Lejun","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Jun","family":"He","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,1,13]]},"reference":[{"key":"2309_CR1","doi-asserted-by":"crossref","unstructured":"Ma L., Lu Z., Shang L., Li H: Multimodal convolutional neural networks for matching image and sentence. In: Proceedings of the IEEE international conference on computer vision, pp. 2623\u20132631 (2015)","DOI":"10.1109\/ICCV.2015.301"},{"issue":"1","key":"2309_CR2","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1109\/JPROC.2015.2487976","volume":"104","author":"J Wang","year":"2016","unstructured":"Wang, J., Liu, W., Kumar, S., Chang, S.: Learning to hash for indexing big data\u2014a survey. Proc IEEE 104(1), 34\u201357 (2016)","journal-title":"Proc IEEE"},{"key":"2309_CR3","doi-asserted-by":"publisher","first-page":"75","DOI":"10.1109\/MMUL.2016.39","volume":"23","author":"W Liu","year":"2016","unstructured":"Liu, W., Zhang, T.: Multimedia hashing and networking. IEEE Multimedia 23, 75\u201379 (2016)","journal-title":"IEEE Multimedia"},{"key":"2309_CR4","doi-asserted-by":"publisher","first-page":"175","DOI":"10.1016\/j.patcog.2017.03.021","volume":"75","author":"J Song","year":"2018","unstructured":"Song, J., Gao, L., Liu, L., Zhu, X., Sebe, N.: Quantization-based hashing: a general framework for scalable image and video retrieval. 
{"issue":"4","key":"2309_CR5","doi-asserted-by":"publisher","first-page":"769","DOI":"10.1109\/TPAMI.2017.2699960","volume":"40","author":"J Wang","year":"2018","unstructured":"Wang, J., Zhang, T., Song, J., Sebe, N., Shen, H.: A survey on learning to hash. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 769\u2013790 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2309_CR6","doi-asserted-by":"crossref","unstructured":"Pradhan J., Ajad A., Pal A.K., et al.: Multi-level colored directional motif histograms for content-based image retrieval. Visual Computer 36(9) (2020)","DOI":"10.1007\/s00371-019-01773-9"},{"issue":"1","key":"2309_CR7","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/s00371-010-0510-6","volume":"27","author":"B Feng","year":"2011","unstructured":"Feng, B., Cao, J., et al.: Graph-based multi-space semantic correlation propagation for video retrieval. Visual Comput 27(1), 21\u201334 (2011)","journal-title":"Visual Comput"},{"issue":"7","key":"2309_CR8","first-page":"119","volume":"37","author":"SH Hashemi","year":"2021","unstructured":"Hashemi, S.H., Safayani, M., Mirzaei, A.: Multiple answers to a question: a new approach for visual question answering. Visual Comput 37(7), 119\u2013131 (2021)","journal-title":"Visual Comput"},{"key":"2309_CR9","first-page":"16","volume":"3","author":"L Ma","year":"2016","unstructured":"Ma, L., Lu, Z., Li, H.: Learning to answer questions from image using convolutional neural network. Assoc Adv Artificial Intell 3, 16 (2016)","journal-title":"Assoc Adv Artificial Intell"},{"key":"2309_CR10","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-018-3579-x","author":"Z Haijun","year":"2019","unstructured":"Haijun, Z., Yuzhu, J., Wang, H., Linlin, L.: Sitcom-star-based clothing retrieval for video advertising: a deep learning framework. Neural Comput Appl (2019). https:\/\/doi.org\/10.1007\/s00521-018-3579-x","journal-title":"Neural Comput Appl"},{"key":"2309_CR11","doi-asserted-by":"publisher","first-page":"1309","DOI":"10.1007\/s00371-020-01867-9","volume":"37","author":"G Barlas","year":"2021","unstructured":"Barlas, G., Veinidis, C., Arampatzis, A.: What we see in a photograph: content selection for image captioning. Vis Comput 37, 1309\u20131326 (2021)","journal-title":"Vis Comput"},{"key":"2309_CR12","doi-asserted-by":"publisher","first-page":"1655","DOI":"10.1007\/s00371-018-1565-z","volume":"35","author":"T Jiang","year":"2019","unstructured":"Jiang, T., Zhang, Z., Yang, Y.: Modeling coverage with semantic embedding for image caption generation. Vis Computer 35, 1655\u20131665 (2019)","journal-title":"Vis Computer"},{"key":"2309_CR13","doi-asserted-by":"crossref","unstructured":"Donahue J., Hendricks L.A., Guadarrama S., Rohrbach M., Darrell T.: Long-term recurrent convolutional networks for visual recognition and description (2014)","DOI":"10.21236\/ADA623249"},{"key":"2309_CR14","doi-asserted-by":"crossref","unstructured":"Marwah T., Mittal G., Balasubramanian V.N.: Attentive semantic video generation using captions. In: Proceedings of the IEEE international conference on computer vision, pp. 1435\u20131443 (2017)","DOI":"10.1109\/ICCV.2017.159"},{"key":"2309_CR15","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence\u2014video to text. In: Proceedings of the IEEE international conference on computer vision, pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},
{"key":"2309_CR16","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure. In: Proceedings of the IEEE international conference on computer vision, pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"2309_CR17","doi-asserted-by":"crossref","unstructured":"Yu H., Wang J., Huang Z., Yang Y., Xu W.: Video paragraph captioning using hierarchical recurrent neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4584\u20134593 (2016)","DOI":"10.1109\/CVPR.2016.496"},{"key":"2309_CR18","doi-asserted-by":"crossref","unstructured":"Pan P., Xu Z., Yang Y., Wu F., Zhuang Y.: Hierarchical recurrent neural encoder for video representation with application to captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1029\u20131038 (2016)","DOI":"10.1109\/CVPR.2016.117"},{"key":"2309_CR19","doi-asserted-by":"crossref","unstructured":"Pan Y., Yao T., Li H., Mei T.: Video captioning with transferred semantic attributes. arXiv preprint arXiv: 1611.07675 (2016)","DOI":"10.1109\/CVPR.2017.111"},{"key":"2309_CR20","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Hendricks, L.A., Mooney, R., Saenko, K.: Improving LSTM-based video description with linguistic knowledge mined from text. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, pp. 1961\u20131966 (2016)","DOI":"10.18653\/v1\/D16-1204"},{"key":"2309_CR21","unstructured":"Wang X., Wu J., Zhang D., Su Y., Wang W.Y.: Learning to compose topic-aware mixture of experts for zero-shot video captioning. arXiv preprint arXiv: 1811.02765 (2018)"},{"key":"2309_CR22","doi-asserted-by":"crossref","unstructured":"Li, Y., Yao, T., Pan, Y., Chao, H., Mei, T.: Jointly localizing and describing events for dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7492\u20137500 (2018)","DOI":"10.1109\/CVPR.2018.00782"},{"key":"2309_CR23","doi-asserted-by":"crossref","unstructured":"Dong J. et al.: Early embedding and late reranking for video captioning. In: Proceedings of the 2016 ACM on multimedia conference, pp. 1082\u20131086 (2016)","DOI":"10.1145\/2964284.2984064"},{"key":"2309_CR24","doi-asserted-by":"crossref","unstructured":"Yu Y., Ko H., Choi J., Kim G.: End-to-end concept word detection for video captioning, retrieval, and question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3261\u20133269 (2017)","DOI":"10.1109\/CVPR.2017.347"},{"key":"2309_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Shi, Y., Yuan, C., Li, B., Wang, P., Hu, W., Zha, Z.-J.: Object relational graph with teacher-recommended learning for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 13278\u201313288 (2020)","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"2309_CR26","doi-asserted-by":"crossref","unstructured":"Xu J., Mei T., Yao T., Rui Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"2309_CR27","unstructured":"Chen D., Dolan W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},
{"key":"2309_CR28","doi-asserted-by":"crossref","unstructured":"Krishna R., Hata K., Ren F., Fei-Fei L., Carlos Niebles J.: Dense-captioning events in videos. In: Proceedings of the IEEE international conference on computer vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"2309_CR29","doi-asserted-by":"crossref","unstructured":"Zhou L., Kalantidis Y., Chen X., Corso J.J., Rohrbach M.: Grounded video description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6578\u20136587 (2019)","DOI":"10.1109\/CVPR.2019.00674"},{"key":"2309_CR30","doi-asserted-by":"crossref","unstructured":"Wang X., Wu J., Chen J., Li L., Wang Y.F., Wang W.Y.: VATEX: a large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE international conference on computer vision, pp. 4580\u20134590 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"2309_CR31","doi-asserted-by":"crossref","unstructured":"Guadarrama S., Krishnamoorthy N., Malkarnenkar G., Venugopalan S., Saenko K.: YouTube2Text: recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2712\u20132719 (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"2309_CR32","doi-asserted-by":"crossref","unstructured":"Krishnamoorthy N., Malkarnenkar G., Mooney R., Saenko K., Guadarrama S.: Generating natural-language video descriptions using text-mined knowledge. In: Proceedings of the Twenty-Seventh AAAI Conference on Artificial Intelligence, pp. 541\u2013547 (2013)","DOI":"10.1609\/aaai.v27i1.8679"},{"key":"2309_CR33","unstructured":"Thomason J., Venugopalan S., Guadarrama S., Saenko K., Mooney R.: Integrating language and vision to generate natural language descriptions of videos in the wild. In: Proceedings of the 25th International Conference on Computational Linguistics, pp. 1218\u20131227 (2014)"},{"key":"2309_CR34","doi-asserted-by":"crossref","unstructured":"Wang X., Wang Y.F., Wang W.Y.: Watch, listen, and describe: globally and locally aligned cross-modal attentions for video captioning. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 795\u2013801 (2018)","DOI":"10.18653\/v1\/N18-2125"},{"key":"2309_CR35","doi-asserted-by":"crossref","unstructured":"Gan, Z., Gan, C., He, X., Pu, Y., Tran, K., Gao, J., Carin, L., Deng, L.: Semantic compositional networks for visual captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1141\u20131150 (2017)","DOI":"10.1109\/CVPR.2017.127"},{"key":"2309_CR36","doi-asserted-by":"crossref","unstructured":"Shen, Z., Li, J., Su, Z., Li, M., Chen, Y., Jiang, Y.-G., Xue, X.: Weakly supervised dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5159\u20135167 (2017)","DOI":"10.1109\/CVPR.2017.548"},{"key":"2309_CR37","doi-asserted-by":"crossref","unstructured":"Pasunuru, R., Bansal, M.: Multi-task video captioning with video and entailment generation. In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics. arXiv: 1704.07489 (2017)","DOI":"10.18653\/v1\/P17-1117"},
{"key":"2309_CR38","doi-asserted-by":"crossref","unstructured":"Chen S., Zhao Y., Jin Q., Wu Q.: Fine-grained video-text retrieval with hierarchical graph reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 10638\u201310647 (2020)","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"2309_CR39","doi-asserted-by":"crossref","unstructured":"Chen, S., Jin, Q., Wang, P., Wu, Q.: Say as you wish: fine-grained control of image caption generation with abstract scene graphs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 9962\u20139971 (2020)","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"2309_CR40","doi-asserted-by":"crossref","unstructured":"Yang S., Li G., Yu Y.: Cross-modal relationship inference for grounding referring expressions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4145\u20134154 (2019)","DOI":"10.1109\/CVPR.2019.00427"},{"key":"2309_CR41","unstructured":"Yang B., Liu F., Zhang C., Zou Y.: Non-autoregressive coarse-to-fine video captioning. arXiv: 1911.12018 (2019)"},{"key":"2309_CR42","doi-asserted-by":"crossref","unstructured":"Rohrbach M., Amin S., Andriluka M., Schiele B.: A database for fine grained activity detection of cooking activities. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1194\u20131201 (2012)","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"2309_CR43","doi-asserted-by":"crossref","unstructured":"Das P., Xu C., Doell R.F., Corso J.J.: A thousand frames in just a few words: lingual description of videos through latent topics and sparse object stitching. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 2634\u20132641 (2013)","DOI":"10.1109\/CVPR.2013.340"},{"key":"2309_CR44","doi-asserted-by":"crossref","unstructured":"Rohrbach M., Regneri M., Andriluka M., Amin S., Pinkal M., Schiele B.: Script data for attribute-based recognition of composite activities. In: European conference on computer vision, Springer, pp. 144\u2013157 (2012)","DOI":"10.1007\/978-3-642-33718-5_11"},{"key":"2309_CR45","doi-asserted-by":"crossref","unstructured":"Rohrbach A., Rohrbach M., Qiu W., Friedrich A., Pinkal M., Schiele B.: Coherent multi-sentence video description with variable level of detail. In: German conference on pattern recognition, Springer, pp. 184\u2013195 (2014)","DOI":"10.1007\/978-3-319-11752-2_15"},{"key":"2309_CR46","doi-asserted-by":"crossref","unstructured":"Zhou L., Xu C., Corso J.: Towards automatic learning of procedures from web instructional videos. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 7590\u20137598 (2018)","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"2309_CR47","doi-asserted-by":"crossref","unstructured":"Rohrbach A., Rohrbach M., Tandon N., Schiele B.: A dataset for movie description. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"2309_CR48","unstructured":"Torabi A., Pal C., Larochelle H., Courville A.: Using descriptive video services to create a large data source for video annotation research. arXiv preprint arXiv: 1503.01070 (2015)"},
{"issue":"6","key":"2309_CR49","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3355390","volume":"52","author":"N Aafaq","year":"2019","unstructured":"Aafaq N., Mian A., Liu W., Gilani S.Z., Shah M.: Video description: a survey of methods, datasets, and evaluation metrics. ACM Comput. Surv. 52(6), 1\u201337 (2019)","journal-title":"ACM Comput. Surv."},{"key":"2309_CR50","doi-asserted-by":"crossref","unstructured":"He K., Zhang X., Ren S., Sun J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2309_CR51","doi-asserted-by":"crossref","unstructured":"Carreira J., Zisserman A.: Quo vadis, action recognition? A new model and the Kinetics dataset. arXiv: 1705.07750 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"2309_CR52","doi-asserted-by":"crossref","unstructured":"Hershey S., Chaudhuri S., Ellis D.P.W., Gemmeke J.F., Jansen A., Moore R.C., Plakal M., Platt D., Saurous R.A., Seybold B.: CNN architectures for large-scale audio classification. In: Proceedings of the 2017 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 131\u2013135 (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"2309_CR53","doi-asserted-by":"crossref","unstructured":"Chen H., Lin K., Maye A., Li J., Hu X.: A semantics-assisted video captioning model trained with scheduled sampling. arXiv: 1909.00121 (2019)","DOI":"10.3389\/frobt.2020.475767"},{"key":"2309_CR54","doi-asserted-by":"crossref","unstructured":"Sun B., Yu L., Zhao Y., He J.: Feedback evaluations to promote image captioning. IET Image Processing, pp. 3021\u20133027 (2020)","DOI":"10.1049\/iet-ipr.2019.1317"},{"key":"2309_CR55","doi-asserted-by":"crossref","unstructured":"Denkowski M., Lavie A.: Meteor Universal: language specific translation evaluation for any target language. In: Proceedings of the ninth workshop on statistical machine translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"2309_CR56","doi-asserted-by":"crossref","unstructured":"Papineni K., Roukos S., Ward T., Zhu W.-J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"2309_CR57","doi-asserted-by":"crossref","unstructured":"Vedantam R., Lawrence Zitnick C., Parikh D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"2309_CR58","unstructured":"Lin C.-Y.: ROUGE: a package for automatic evaluation of summaries. In: Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"2309_CR59","doi-asserted-by":"crossref","unstructured":"Lin T.Y., Maire M., Belongie S., Hays J., Perona P., Ramanan D., Doll\u00e1r P., Zitnick C.L.: Microsoft COCO: common objects in context. In: European conference on computer vision, Springer, pp. 740\u2013755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2309_CR60","doi-asserted-by":"crossref","unstructured":"Xu J., Yao T., Zhang Y., et al.: Learning multimodal attention LSTM networks for video captioning. In: Proceedings of the 25th ACM international conference on Multimedia, pp. 537\u2013545 (2017)","DOI":"10.1145\/3123266.3123448"},
(2017)","DOI":"10.1145\/3123266.3123448"},{"key":"2309_CR61","doi-asserted-by":"crossref","unstructured":"Chen S., Jiang Y.-G.: Motion guided spatial attention for video captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 8191\u20138198 (2019)","DOI":"10.1609\/aaai.v33i01.33018191"},{"key":"2309_CR62","doi-asserted-by":"crossref","unstructured":"Olivastri S., Singh G., Cuzzolin F.J.C.: An End-to-End Baseline for Video Captioning. In: Proceedings of the IEEE International Conference on Computer Vision Workshop, pp 2993\u20133000 (2019)","DOI":"10.1109\/ICCVW.2019.00185"},{"key":"2309_CR63","doi-asserted-by":"crossref","unstructured":"Pasunuru R., Bansal M.Japa.: Continual and multi-task architecture search. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics. arXiv: 1906.05226 (2019)","DOI":"10.18653\/v1\/P19-1185"},{"key":"2309_CR64","doi-asserted-by":"crossref","unstructured":"Pei W., Zhang J., Wang X., Ke L., Shen X., Tai Y.-W.: Memory-attended recurrent network for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8347\u20138356 (2019)","DOI":"10.1109\/CVPR.2019.00854"},{"key":"2309_CR65","doi-asserted-by":"crossref","unstructured":"Zheng Q., Wang C., Tao D.: Syntax-Aware Action Targeting for Video Captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 13096\u201313105 (2020)","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"2309_CR66","doi-asserted-by":"crossref","unstructured":"Zhang Z., Shi Y., Yuan C., Li B., Wang P., Hu W., Zha Z.: Object Relational Graph with Teacher-Recommended Learning for Video Captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 13275\u201313285 (2020)","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"2309_CR67","doi-asserted-by":"crossref","unstructured":"Guo L., Liu J., Zhu X., Yao P., Shichen L., Lu H.: Normalized and geometry-aware self-attention network for image captioning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 10324\u201310333 (2020)","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"2309_CR68","doi-asserted-by":"crossref","unstructured":"Wang B., Ma L., Zhang W., Liu W.: Reconstruction network for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7622\u20137631 (2018)","DOI":"10.1109\/CVPR.2018.00795"},{"key":"2309_CR69","doi-asserted-by":"crossref","unstructured":"Sun B., Wu Y., Zhao K., et al.: Student Class Behavior Dataset: a video dataset for recognizing, detecting, and captioning students' behaviors in classroom scenes. Neural Computing and Applications, pp. 1\u201320. 
(2021)","DOI":"10.1007\/s00521-020-05587-y"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-021-02309-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-021-02309-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-021-02309-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T15:22:38Z","timestamp":1673277758000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-021-02309-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,13]]},"references-count":69,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,1]]}},"alternative-id":["2309"],"URL":"https:\/\/doi.org\/10.1007\/s00371-021-02309-w","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"type":"print","value":"0178-2789"},{"type":"electronic","value":"1432-2315"}],"subject":[],"published":{"date-parts":[[2022,1,13]]},"assertion":[{"value":"19 September 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 January 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they have\u00a0no\u00a0conflicts of interest with regard to this work. We declare that we do not have any commercial or associative interest that represents a conflict of interest in connection with the work submitted.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"The data included in this study may be available upon reasonable request by contacting with the corresponding author.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Data availability statement"}}]}}