{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T15:47:32Z","timestamp":1765295252389,"version":"3.37.3"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2021,8,7]],"date-time":"2021-08-07T00:00:00Z","timestamp":1628294400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,8,7]],"date-time":"2021-08-07T00:00:00Z","timestamp":1628294400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"name":"Key-Area Research and Development Program of Guangdong Province","award":["2018B010109007"],"award-info":[{"award-number":["2018B010109007"]}]},{"name":"Key-Area Research and Development Program of Guangdong Province","award":["2019B010153002"],"award-info":[{"award-number":["2019B010153002"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62002071"],"award-info":[{"award-number":["62002071"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61903091"],"award-info":[{"award-number":["61903091"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Guangdong Provincial Key Laboratory of Cyber-Physical System","award":["2020B1212060069"],"award-info":[{"award-number":["2020B1212060069"]}]},{"DOI":"10.13039\/501100021171","name":"Guangdong Basic and Applied Basic Research Foundation","doi-asserted-by":"crossref","award":["2020A1515010801"],"award-info":[{"award-number":["2020A1515010801"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2022,3]]},"DOI":"10.1007\/s10489-021-02612-y","type":"journal-article","created":{"date-parts":[[2021,8,7]],"date-time":"2021-08-07T13:03:23Z","timestamp":1628341403000},"page":"5241-5260","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["MIVCN: Multimodal interaction video captioning network based on semantic association graph"],"prefix":"10.1007","volume":"52","author":[{"given":"Ying","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3640-3229","authenticated-orcid":false,"given":"Guoheng","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Lin","family":"Yuming","sequence":"additional","affiliation":[]},{"given":"Haoliang","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Chi-Man","family":"Pun","sequence":"additional","affiliation":[]},{"given":"Wing-Kuen","family":"Ling","sequence":"additional","affiliation":[]},{"given":"Lianglun","family":"Cheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,8,7]]},"reference":[{"key":"2612_CR1","doi-asserted-by":"crossref","unstructured":"Viola P, Jones M (2001) Rapid object detection using a boosted cascade of simple features. In: 2001 IEEE Computer Society Conference on Computer Vision and Pattern Recognition. CVPR, Kauai, HI, USA (Vol. 1, pp. I-I)","DOI":"10.1109\/CVPR.2001.990517"},{"key":"2612_CR2","doi-asserted-by":"crossref","unstructured":"Lowe DG (1999) Object recognition from local scale-invariant features. In: 1999 IEEE International Conference on Computer Vision. Kerkyra, Greece, pp 1150\u20131157 vol.2","DOI":"10.1109\/ICCV.1999.790410"},{"key":"2612_CR3","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B (2005) Histograms of oriented gradients for human detection. In: 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'05), San Diego, CA, USA, pp 886-893 vol. 1","DOI":"10.1109\/CVPR.2005.177"},{"key":"2612_CR4","unstructured":"Langkilde-geary I, Knight K (2002) Halogen statistical sentence generator. In: Proceedings of the ACL-02 Demonstrations Session. Philadelphia. pp 102-103"},{"key":"2612_CR5","unstructured":"Pollard CJ, Sag IA (1994) Head-driven phrase structure grammar. University of Chicago Press"},{"key":"2612_CR6","unstructured":"Ehud R, Robert D (2006) Building natural language generation systems (studies in natural language processing). Cambridge University Press"},{"key":"2612_CR7","doi-asserted-by":"crossref","unstructured":"Das P, Xu C, Doell RF, Corso JJ (2013) A thousand frames in just a few words: lingual description of videos through latent topics and sparse object stitching. In:2013 IEEE Conference on Computer Vision and Pattern Recognition. Portland, OR, pp 2634-2641","DOI":"10.1109\/CVPR.2013.340"},{"key":"2612_CR8","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li T, Mei T (2017) Video captioning with transferred semantic attributes. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition. CVPR, Honolulu, HI, pp. 984\u2013992","DOI":"10.1109\/CVPR.2017.111"},{"key":"2612_CR9","doi-asserted-by":"crossref","unstructured":"Hemalatha M, Sekhar CC (2020) Domain-specific semantics guided approach to video captioning. In:2020 IEEE Winter Conference on Applications of Computer Vision. WACV, Snowmass Village, CO, USA, pp 1576-1585","DOI":"10.1109\/WACV45572.2020.9093344"},{"key":"2612_CR10","doi-asserted-by":"crossref","unstructured":"Ryu H, et al. (2021) Semantic grouping network for video captioning. In: proceedings of the AAAI Conference on Artificial Intelligence. Columbia, Canada. arXiv preprint arXiv:2102.00831","DOI":"10.1609\/aaai.v35i3.16353"},{"key":"2612_CR11","doi-asserted-by":"crossref","unstructured":"Yang B, et al. (2021) Non-autoregressive coarse-to-fine video captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence. Columbia, Canada. arXiv preprint arXiv:1911.12018","DOI":"10.1609\/aaai.v35i4.16421"},{"key":"2612_CR12","first-page":"4507","volume-title":"2015 IEEE international conference on computer vision","author":"L Yao","year":"2015","unstructured":"Yao L et al (2015) Describing videos by exploiting temporal structure. In: 2015 IEEE international conference on computer vision. ICCV, Santiago, pp 4507\u20134515"},{"key":"2612_CR13","doi-asserted-by":"crossref","unstructured":"Venugopalan S, et al. (2014) Translating videos to natural language using deep recurrent neural networks. In: Human Language Technologies: The 2015 Annual Conference of the North American Chapter of the ACL. Denver, Colorado, arXiv preprint arXiv:1412.4729","DOI":"10.3115\/v1\/N15-1173"},{"key":"2612_CR14","first-page":"4534","volume-title":"2015 IEEE international conference on computer vision","author":"S Venugopalan","year":"2015","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney R, Darrell T, Saenko K (2015) Sequence to Sequence -- Video to Text. In: 2015 IEEE international conference on computer vision. ICCV, Santiago, pp 4534\u20134542"},{"key":"2612_CR15","doi-asserted-by":"crossref","unstructured":"Yu H, Wang J, Huang Z, Yang Y, Xu W (2016) Video paragraph captioning using hierarchical recurrent neural networks. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition. CVPR, Las Vegas, NV, pp. 4584\u20134593","DOI":"10.1109\/CVPR.2016.496"},{"key":"2612_CR16","doi-asserted-by":"crossref","unstructured":"Liang Y, He F, Zeng X. (2020) 3D mesh simplification with feature preservation based on whale optimization algorithm and differential evolution[J]. Integrated computer-aided engineering, (preprint): 1-19","DOI":"10.3233\/ICA-200641"},{"key":"2612_CR17","doi-asserted-by":"publisher","first-page":"106335","DOI":"10.1016\/j.asoc.2020.106335","volume":"93","author":"Y Chen","year":"2020","unstructured":"Chen Y, He F, Li H, Zhang D, Wu Y (2020) A full migration BBO algorithm with enhanced population quality bounds for multimodal biomedical image registration[J]. Appl Soft Comput 93:106335","journal-title":"Appl Soft Comput"},{"issue":"2","key":"2612_CR18","doi-asserted-by":"publisher","first-page":"245","DOI":"10.1007\/s00371-020-01796-7","volume":"37","author":"Q Quan","year":"2021","unstructured":"Quan Q, He F, Li H (2021) A multi-phase blending method with incremental intensity for training detection networks[J]. Vis Comput 37(2):245\u2013259","journal-title":"Vis Comput"},{"key":"2612_CR19","doi-asserted-by":"crossref","unstructured":"Wang B, Ma L, Zhang W, Liu W (2018) Reconstruction Network for Video Captioning. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, Salt Lake City, UT, pp. 7622\u20137631","DOI":"10.1109\/CVPR.2018.00795"},{"key":"2612_CR20","first-page":"2641","volume-title":"2019 IEEE\/CVF International Conference on Computer Vision","author":"B Wang","year":"2019","unstructured":"Wang B et al (2019) Controllable video captioning with pos sequence guidance based on gated fusion network. In: 2019 IEEE\/CVF International Conference on Computer Vision. Seoul, South Korea, pp 2641\u20132650"},{"key":"2612_CR21","doi-asserted-by":"crossref","unstructured":"Aafaq N, Akhtar N, Liu W, Gilani SZ, Mian A (2019) Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning. In:2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, Long Beach, CA, USA, pp 12479-12488","DOI":"10.1109\/CVPR.2019.01277"},{"key":"2612_CR22","doi-asserted-by":"crossref","unstructured":"Pan B, et al. (2020) Spatio-temporal graph for video captioning with knowledge distillation. In:2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, Seattle, WA, USA, pp 10867-10876","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"2612_CR23","doi-asserted-by":"crossref","unstructured":"Gan Z, et al. (2017) Semantic compositional networks for visual captioning. In:2017 IEEE Conference on Computer Vision and Pattern Recognition. CVPR, Honolulu, HI, pp 1141-1150","DOI":"10.1109\/CVPR.2017.127"},{"issue":"9","key":"2612_CR24","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao L, Guo Z, Zhang H, Xu X, Shen HT (2017) Video captioning with attention-based LSTM and semantic consistency. IEEE Transactions on Multimedia 19(9):2045\u20132055","journal-title":"IEEE Transactions on Multimedia"},{"key":"2612_CR25","doi-asserted-by":"publisher","first-page":"222","DOI":"10.1016\/j.neucom.2018.06.096","volume":"395","author":"L Gao","year":"2020","unstructured":"Gao L, Wang X, Song J, Liu Y (2020) Fused GRU with semantic-temporal attention for video captioning. Neurocomputing 395:222\u2013228","journal-title":"Neurocomputing"},{"key":"2612_CR26","doi-asserted-by":"crossref","unstructured":"Liu S, Ren Z, Yuan J (2019) SibNet: sibling convolutional encoder for video captioning. IEEE Transactions on Pattern Analysis and Machine Intelligence pp:1\u20131","DOI":"10.1145\/3240508.3240667"},{"key":"2612_CR27","first-page":"7370","volume-title":"Proceedings of the AAAI conference on artificial intelligence","author":"L Yao","year":"2019","unstructured":"Yao L, Mao CS, Lo Y (2019) Graph convolutional networks for text classification. In: Proceedings of the AAAI conference on artificial intelligence. AAAI, Honolulu, Hawaii, pp 7370\u20137377"},{"issue":"1","key":"2612_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s12293-021-00328-7","volume":"13","author":"H Li","year":"2021","unstructured":"Li H, He F, Chen Y, Pan Y (2021) MLFS-CCDE: multi-objective large-scale feature selection by cooperative coevolutionary differential evolution[J]. Memetic Computing 13(1):1\u201318","journal-title":"Memetic Computing"},{"key":"2612_CR29","doi-asserted-by":"crossref","unstructured":"You Q, Jin H, Wang Z, Fang C, Luo J (2016) Image captioning with semantic attention. In: 2016 IEEE conference on computer vision and pattern recognition. CVPR, Las Vegas, NV, pp. 4651\u20134659","DOI":"10.1109\/CVPR.2016.503"},{"key":"2612_CR30","unstructured":"Kiros R, Salakhutdinov R, Zemel RS (2014) Unifying visual-semantic embeddings with multimodal neural language models. In: NIPS 2014 deep learning workshop. Montreal, Canada. arXiv preprint arXiv:1411.2539"},{"issue":"4","key":"2612_CR31","doi-asserted-by":"publisher","first-page":"664","DOI":"10.1109\/TPAMI.2016.2598339","volume":"39","author":"A Karpathy","year":"2017","unstructured":"Karpathy A, Fei-Fei L (2017) Deep visual-semantic alignments for generating image descriptions. IEEE Trans Pattern Anal Mach Intell 39(4):664\u2013676","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2612_CR32","unstructured":"Mikolov T, et al. (2013) Efficient estimation of word representations in vector space. Computer science. arXiv preprint arXiv:1301.3781"},{"key":"2612_CR33","doi-asserted-by":"crossref","unstructured":"Xie S, Girshick R, Doll\u00e1r P, Tu Z, He K (2017) Aggregated Residual Transformations for Deep Neural Networks. In: 2017 IEEE conference on computer vision and pattern recognition. CVPR, Honolulu, HI, pp. 5987\u20135995","DOI":"10.1109\/CVPR.2017.634"},{"key":"2612_CR34","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky O, Deng J, Su H, Krause J, Satheesh S, Ma S, Huang Z, Karpathy A, Khosla A, Bernstein M, Berg AC, Fei-Fei L (2015) Imagenet large scale visual recognition challenge. Int J Comput Vis 115:211\u2013252","journal-title":"Int J Comput Vis"},{"key":"2612_CR35","first-page":"695","volume-title":"Eco: efficient convolutional network for online video understanding","author":"M Zolfaghari","year":"2018","unstructured":"Zolfaghari M, Singh K, Brox T (2018) Eco: efficient convolutional network for online video understanding. Proceedings of the European conference on computer vision. ECCV, In, pp 695\u2013712"},{"key":"2612_CR36","unstructured":"Kay W, et al. (2017) The kinetics human action video dataset. In: computer vision and pattern recognition. Hawaii, USA. arXiv preprint arXiv:1705.06950"},{"issue":"6","key":"2612_CR37","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren S, He K, Girshick R, Sun J (2017) Faster R-CNN: towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39(6):1137\u20131149","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2612_CR38","unstructured":"Kingma DA (2014) A method for stochastic optimization. Computer Science. arXiv preprint arXiv:1412.6980"},{"key":"2612_CR39","doi-asserted-by":"crossref","unstructured":"Freitag M, Al-Onaizan Y (2017) Beam search strategies for neural machine translation. In: Proceedings of the First Workshop on Neural Machine Translation. arXiv preprint arXiv:1702.01806","DOI":"10.18653\/v1\/W17-3207"},{"key":"2612_CR40","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos R, Ward T, Zhu WJ (2002) BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the Annual Meeting on Association for Computational Linguistics. ACL, pp. 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"2612_CR41","doi-asserted-by":"crossref","unstructured":"Crouse JR, Raichlen JS, Riley WA, Evans GW, Palmer MK, O\u2019Leary DH, Grobbee DE, Bots ML, METEOR Study Group, et al. (2007) METEOR Study Group, et al. 2007. Effect of rosuvastatin on progression of carotid intima-media thickness in low-risk individuals with subclinical atherosclerosis: the METEOR trial. JAMA The Journal of the American Medical Association 297(12):1344\u20131353","DOI":"10.1001\/jama.297.12.1344"},{"key":"2612_CR42","first-page":"74","volume-title":"Association for Computational Linguistics","author":"CY Lin","year":"2004","unstructured":"Lin CY (2004) Rouge: a package for automatic evaluation of summaries. In: Association for Computational Linguistics. Barcelona, Spain, pp 74\u201381"},{"key":"2612_CR43","doi-asserted-by":"crossref","unstructured":"Vedantam R, Zitnick CL, Parikh D (2015) CIDEr: Consensus-based image description evaluation. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition. CVPR, Boston, MA, pp. 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"2612_CR44","doi-asserted-by":"crossref","unstructured":"Yao L, et al. (2015) Describing videos by exploiting temporal structure. In: 2015 IEEE International Conference on Computer Vision. ICCV, Santiago, pp. 4507\u20134515","DOI":"10.1109\/ICCV.2015.512"},{"key":"2612_CR45","doi-asserted-by":"crossref","unstructured":"Pan Y, Mei T, Yao T, Li H, Rui Y (2016) Jointly modeling embedding and translation to bridge video and language. In:2016 IEEE conference on computer vision and pattern recognition. CVPR, Las Vegas, NV, pp 4594-4602","DOI":"10.1109\/CVPR.2016.497"},{"key":"2612_CR46","doi-asserted-by":"crossref","unstructured":"Pan P, Xu Z, Yang Y, Wu F, Zhuang Y (2016) Hierarchical Recurrent Neural Encoder for Video Representation with Application to Captioning. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition. CVPR, Las Vegas, NV, pp. 1029\u20131038","DOI":"10.1109\/CVPR.2016.117"},{"key":"2612_CR47","doi-asserted-by":"crossref","unstructured":"Zhu L, Xu Z, Yang Y (2017) Bidirectional multirate reconstruction for temporal modeling in videos. In: 2017 IEEE conference on computer vision and pattern recognition. CVPR, Honolulu, HI, pp. 1339\u20131348","DOI":"10.1109\/CVPR.2017.147"},{"key":"2612_CR48","doi-asserted-by":"crossref","unstructured":"Venugopalan S , Xu H , Donahue J , et al. (2014) Translating videos to natural language using deep recurrent neural networks. North American chapter of the Association for Computational Linguistics (NAACL) Baltimore, Maryland, USA arXiv preprint arXiv:1412.4729","DOI":"10.3115\/v1\/N15-1173"},{"key":"2612_CR49","doi-asserted-by":"publisher","first-page":"1300","DOI":"10.1109\/ICME.2019.00226","volume-title":"2019 IEEE International Conference on Multimedia and Expo","author":"L Sun","year":"2019","unstructured":"Sun L, Li B, Yuan C, Zha Z, Hu W (2019) Multimodal semantic attention network for video captioning. In: 2019 IEEE International Conference on Multimedia and Expo. ICME, Shanghai, China, pp 1300\u20131305"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02612-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-021-02612-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02612-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,6]],"date-time":"2024-09-06T00:52:26Z","timestamp":1725583946000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-021-02612-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,7]]},"references-count":49,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2022,3]]}},"alternative-id":["2612"],"URL":"https:\/\/doi.org\/10.1007\/s10489-021-02612-y","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"type":"print","value":"0924-669X"},{"type":"electronic","value":"1573-7497"}],"subject":[],"published":{"date-parts":[[2021,8,7]]},"assertion":[{"value":"9 June 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 August 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}