{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:05:48Z","timestamp":1778079948091,"version":"3.51.4"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"28","license":[{"start":{"date-parts":[[2024,2,8]],"date-time":"2024-02-08T00:00:00Z","timestamp":1707350400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,2,8]],"date-time":"2024-02-08T00:00:00Z","timestamp":1707350400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012476","name":"Fundamental Research Funds for Central Universities of the Central South University","doi-asserted-by":"publisher","award":["B220202019"],"award-info":[{"award-number":["B220202019"]}],"id":[{"id":"10.13039\/501100012476","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100014718","name":"Innovative Research Group Project of the National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276090"],"award-info":[{"award-number":["62276090"]}],"id":[{"id":"10.13039\/100014718","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Top Talent of Changzhou \u201cThe 14th Five-Year Plan\u201d High-Level Health Talents Training Project","award":["2022260"],"award-info":[{"award-number":["2022260"]}]},{"DOI":"10.13039\/501100013058","name":"Jiangsu Provincial Key Research and Development Program","doi-asserted-by":"publisher","award":["BK20192004"],"award-info":[{"award-number":["BK20192004"]}],"id":[{"id":"10.13039\/501100013058","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013058","name":"Jiangsu Provincial Key Research and Development Program","doi-asserted-by":"publisher","award":["BE2018004-04"],"award-info":[{"award-number":["BE2018004-04"]}],"id":[{"id":"10.13039\/501100013058","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-18372-z","type":"journal-article","created":{"date-parts":[[2024,2,8]],"date-time":"2024-02-08T08:03:57Z","timestamp":1707379437000},"page":"72113-72130","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Multi-level video captioning method based on semantic space"],"prefix":"10.1007","volume":"83","author":[{"given":"Xiao","family":"Yao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuanlin","family":"Zeng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Min","family":"Gu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruxi","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junyi","family":"Ge","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,2,8]]},"reference":[{"key":"18372_CR1","doi-asserted-by":"crossref","unstructured":"Chen S, Yao T, Jiang Y-G (2019) Deep learning for video captioning: A review. IJCAI 1","DOI":"10.24963\/ijcai.2019\/877"},{"issue":"12","key":"18372_CR2","doi-asserted-by":"publisher","first-page":"9434","DOI":"10.1109\/TPAMI.2021.3126682","volume":"44","author":"M Monfort","year":"2021","unstructured":"Monfort M, Pan B, Ramakrishnan K et al (2021) Multi-moments in time: learning and interpreting models for multi-action video understanding. IEEE Trans Pattern Anal Mach Intel 44(12):9434\u20139445","journal-title":"IEEE Trans Pattern Anal Mach Intel"},{"key":"18372_CR3","unstructured":"Cai JJ, Tang J, Chen QG, Hu Y, Wang X, Huang SJ (2018) Surveil- lance applications. In: 2018 International Conference on Communication and Signal Processing (ICCSP). IEEE, pp 563\u2013568"},{"key":"18372_CR4","first-page":"2053","volume":"2019","author":"JJ Cai","year":"2019","unstructured":"Cai JJ, Tang J, Chen QG, Hu Y, Wang X, Huang SJ (2019) Multi-view active learning for video recommendation. IJCAI 2019:2053\u20132059","journal-title":"IJCAI"},{"key":"18372_CR5","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"N Aafaq","year":"2019","unstructured":"Aafaq N et al (2019) Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition"},{"key":"18372_CR6","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"K He","year":"2016","unstructured":"He K et al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition"},{"key":"18372_CR7","volume-title":"2017 IEEE international conference on acoustics, speech and signal processing (ICASSP)","author":"S Hershey","year":"2017","unstructured":"Hershey S et al (2017) CNN architectures for large-scale audio classification. In: 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE"},{"key":"18372_CR8","doi-asserted-by":"crossref","unstructured":"Tran D et al (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2015.510"},{"key":"18372_CR9","unstructured":"Ng JY-H et al (2015) Beyond short snippets: Deep networks for video classification. In: Proceedings of the IEEE conference on computer vision and pattern recognition"},{"key":"18372_CR10","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"X Wang","year":"2018","unstructured":"Wang X et al (2018) Non-local neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition"},{"key":"18372_CR11","unstructured":"Vaswani A et al (2017) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"18372_CR12","doi-asserted-by":"crossref","unstructured":"Venugopalan, Subhashini et al (2014) Translating videos to natural language using deep recurrent neural networks. arXiv preprint arXiv:1412.4729","DOI":"10.3115\/v1\/N15-1173"},{"key":"18372_CR13","doi-asserted-by":"crossref","unstructured":"Donahue J, Hendricks LA, Guadarrama S, Rohrbach M, Venu- gopalan S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 2625\u20132634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"18372_CR14","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Z Gan","year":"2017","unstructured":"Gan Z et al (2017) Semantic compositional networks for visual captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition"},{"key":"18372_CR15","volume-title":"Proceedings of the IEEE international conference on computer vision","author":"L Yao","year":"2015","unstructured":"Yao L et al (2015) Describing videos by exploiting temporal structure. In: Proceedings of the IEEE international conference on computer vision"},{"key":"18372_CR16","volume-title":"Proceedings of the European conference on computer vision (ECCV)","author":"Y Chen","year":"2018","unstructured":"Chen Y et al (2018) Less is more: Picking informative frames for video captioning. In: Proceedings of the European conference on computer vision (ECCV)"},{"key":"18372_CR17","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"W Pei","year":"2019","unstructured":"Pei W et al (2019) Memory-attended recurrent network for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition"},{"key":"18372_CR18","volume-title":"Proceedings of the AAAI conference on artificial intelligence","author":"J Hou","year":"2020","unstructured":"Hou, Jingyi et al (2020) Commonsense and relation reasoning for image and video captioning. In: Proceedings of the AAAI conference on artificial intelligence 34(07)"},{"key":"18372_CR19","doi-asserted-by":"crossref","unstructured":"Zhang J, Peng Y (2019) Object-aware aggregation with bidirectional temporal graph for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2019.00852"},{"key":"18372_CR20","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"B Pan","year":"2020","unstructured":"Pan B et al (2020) Spatio-temporal graph for video captioning with knowledge distillation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition"},{"key":"18372_CR21","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"Z Zhang","year":"2020","unstructured":"Zhang Z et al (2020) Object relational graph with teacher-recommended learning for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition"},{"key":"18372_CR22","doi-asserted-by":"crossref","unstructured":"Bai Y, Wang J, Long Y et al (2021) Discriminative latent semantic graph for video captioning. In: Proceedings of the 29th ACM International Conference on Multimedia, pp 3556\u20133564","DOI":"10.1145\/3474085.3475519"},{"key":"18372_CR23","unstructured":"He E, Li G, Qi Y et al (2022) Hierarchical modular network for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 17939\u201317948"},{"key":"18372_CR24","unstructured":"Chen D, Dolan WB (2011) Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp 190\u2013200"},{"key":"18372_CR25","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"J Xu","year":"2016","unstructured":"Xu J et al (2016) MSR-VTT: A large video description dataset for bridging video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition"},{"key":"18372_CR26","doi-asserted-by":"crossref","unstructured":"Tan G et al (2020) Learning to discretely compose reasoning module networks for video captioning. arXiv preprint arXiv:2007.09049","DOI":"10.24963\/ijcai.2020\/104"},{"key":"18372_CR27","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"18372_CR28","doi-asserted-by":"crossref","unstructured":"Denkowski M, Lavie A (2014) Meteor universal: Language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp 376\u2013380","DOI":"10.3115\/v1\/W14-3348"},{"key":"18372_CR29","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"R Vedantam","year":"2015","unstructured":"Vedantam R, Zitnick CL, Parikh D (2015) Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition"},{"key":"18372_CR30","unstructured":"Lin C-Y (2004) Rouge: A package for automatic evaluation of summaries. Text summarization branches out"},{"key":"18372_CR31","doi-asserted-by":"crossref","unstructured":"Szegedy C, Ioffe S, Vanhoucke V, Alemi A (2017) Inception-v4, inception- resnet and the impact of residual connections on learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 31","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"18372_CR32","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"18372_CR33","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: Towards real-time object detection with region proposal networks. arXiv preprint arXiv:1506.01497"},{"key":"18372_CR34","doi-asserted-by":"crossref","unstructured":"He C, Li K, Zhang Y et al (2023) Camouflaged object detection with feature decomposition and edge reconstruction[C]\/\/Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 22046\u201322055","DOI":"10.1109\/CVPR52729.2023.02111"},{"key":"18372_CR35","unstructured":"He C, Li K, Zhang Y et al (2023) Weakly-Supervised Concealed Object Segmentation with SAM-based Pseudo Labeling and Multi-scale Feature Grouping[J]. arXiv preprint arXiv:2305.11003"},{"key":"18372_CR36","unstructured":"Patrick M et al (2020) Support-set bottlenecks for video-text representation learning. arXiv preprint arXiv:2010.02824"},{"key":"18372_CR37","unstructured":"Li L et al (2021) Value: A multi-task benchmark for video-and-language understanding evaluation. arXiv preprint arXiv:2106.04632"},{"key":"18372_CR38","volume-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXI 16","author":"J Lei","year":"2020","unstructured":"Lei J et al (2020) Tvr: A large-scale dataset for video-subtitle moment retrieval. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXI 16. Springer International Publishing"},{"key":"18372_CR39","doi-asserted-by":"crossref","unstructured":"Li L et al (2020) Hero: Hierarchical encoder for video+ language omni-representation pre-training. arXiv preprint arXiv:2005.00200","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"18372_CR40","volume-title":"Proceedings of the 57th annual meeting of the association for computational linguistics","author":"B Shi","year":"2019","unstructured":"Shi B et al (2019) Dense procedure captioning in narrated instructional videos. In: Proceedings of the 57th annual meeting of the association for computational linguistics"},{"key":"18372_CR41","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision","author":"C Sun","year":"2019","unstructured":"Sun C et al (2019) Videobert: A joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF international conference on computer vision"},{"key":"18372_CR42","doi-asserted-by":"crossref","unstructured":"He C et al (2023) Hqg-net: Unpaired medical image enhancement with high-quality guidance. IEEE Trans Neural Netw Learn Syst","DOI":"10.1109\/TNNLS.2023.3315307"},{"key":"18372_CR43","doi-asserted-by":"crossref","unstructured":"He C et al (2023) Degradation-resistant unfolding network for heterogeneous image fusion. Proceedings of the IEEE\/CVF International Conference on Computer Vision","DOI":"10.1109\/ICCV51070.2023.01159"},{"key":"18372_CR44","volume-title":"International conference on machine learning","author":"N Parmar","year":"2018","unstructured":"Parmar N et al (2018) Image transformer. In: International conference on machine learning. PMLR"},{"key":"18372_CR45","unstructured":"Ramachandran P et al (2019) Stand-alone self-attention in vision models. Adv Neural Inf Process Syst 32"},{"key":"18372_CR46","unstructured":"Bertasius G, Wang H, Torresani L (2021) Is space-time attention all you need for video understanding? ICML 2(3)"},{"key":"18372_CR47","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"R Girdhar","year":"2019","unstructured":"Girdhar R et al (2019) Video action transformer network. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition"},{"key":"18372_CR48","doi-asserted-by":"publisher","first-page":"8753","DOI":"10.1109\/TMM.2023.3241517","volume":"25","author":"Z Shao","year":"2023","unstructured":"Shao Z, Han J, Debattista K, Pang Y (2023) Textual context-aware dense captioning with diverse words. IEEE Trans Multimedia 25:8753\u20138766. https:\/\/doi.org\/10.1109\/TMM.2023.3241517","journal-title":"IEEE Trans Multimedia"},{"issue":"6","key":"18372_CR49","doi-asserted-by":"publisher","first-page":"3891","DOI":"10.1007\/s00530-023-01166-y","volume":"29","author":"J Chang","year":"2023","unstructured":"Chang J, Zhang L, Shao Z (2023) View-target relation-guided unsupervised 2D image-based 3D model retrieval via transformer. Multimedia Syst 29(6):3891\u20133901","journal-title":"Multimedia Syst"},{"key":"18372_CR50","unstructured":"Lu J et al (2019) Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv Neural Inf Process Syst 32"},{"key":"18372_CR51","unstructured":"He C et al (2023) Strategic preys make acute predators: Enhancing camouflaged object detectors by generating camouflaged objects. arXiv preprint arXiv:2308.03166"},{"key":"18372_CR52","unstructured":"Yang A, Nagrani A, Seo PH, Miech A, Pont-Tuset J, Laptev I, Sivic J, Schmid C (2023) Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 10714\u201310726"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-18372-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-18372-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-18372-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,30]],"date-time":"2024-07-30T17:24:44Z","timestamp":1722360284000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-18372-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,8]]},"references-count":52,"journal-issue":{"issue":"28","published-online":{"date-parts":[[2024,8]]}},"alternative-id":["18372"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-18372-z","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,2,8]]},"assertion":[{"value":"4 May 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 January 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 January 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 February 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they had no conflicts of interest with respect to their authorship or the publication of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}