{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T03:04:04Z","timestamp":1780542244771,"version":"3.54.1"},"reference-count":56,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100003471","name":"Harbin Engineering University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003471","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Signal Processing: Image Communication"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.image.2026.117585","type":"journal-article","created":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T23:01:27Z","timestamp":1779145287000},"page":"117585","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["CSACE-Net: Cross-Modal Semantic Attention Co-Enhancement for video captioning"],"prefix":"10.1016","volume":"147","author":[{"given":"Jiashan","family":"He","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7037-3214","authenticated-orcid":false,"given":"Yan","family":"Cang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qingbo","family":"Ji","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"2","key":"10.1016\/j.image.2026.117585_b1","article-title":"Spatiotemporal enhanced video subtitle generation combining state space model and transformer","volume":"41","author":"Sun","year":"2025","journal-title":"J. Signal Process."},{"issue":"16","key":"10.1016\/j.image.2026.117585_b2","doi-asserted-by":"crossref","first-page":"11815","DOI":"10.1007\/s00521-023-08323-4","article-title":"Cross-media correlation learning for web video event mining with integrated text semantics and network structural information","volume":"35","author":"Zhang","year":"2023","journal-title":"Neural Comput. Appl."},{"key":"10.1016\/j.image.2026.117585_b3","series-title":"Delving deeper into convolutional networks for learning video representations","author":"Ballas","year":"2015"},{"key":"10.1016\/j.image.2026.117585_b4","doi-asserted-by":"crossref","unstructured":"P. Pan, Z. Xu, Y. Yang, F. Wu, Y. Zhuang, Hierarchical recurrent neural encoder for video representation with application to captioning, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 1029\u20131038.","DOI":"10.1109\/CVPR.2016.117"},{"key":"10.1016\/j.image.2026.117585_b5","doi-asserted-by":"crossref","unstructured":"L. Yao, A. Torabi, K. Cho, N. Ballas, C. Pal, H. Larochelle, A. Courville, Describing videos by exploiting temporal structure, in: Proceedings of the IEEE International Conference on Computer Vision, 2015, pp. 4507\u20134515.","DOI":"10.1109\/ICCV.2015.512"},{"key":"10.1016\/j.image.2026.117585_b6","doi-asserted-by":"crossref","unstructured":"H. Yu, J. Wang, Z. Huang, Y. Yang, W. Xu, Video paragraph captioning using hierarchical recurrent neural networks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 4584\u20134593.","DOI":"10.1109\/CVPR.2016.496"},{"key":"10.1016\/j.image.2026.117585_b7","doi-asserted-by":"crossref","unstructured":"J. Wang, W. Wang, Y. Huang, L. Wang, T. Tan, Hierarchical memory modelling for video captioning, in: Proceedings of the 26th ACM International Conference on Multimedia, 2018, pp. 63\u201371.","DOI":"10.1145\/3240508.3240538"},{"key":"10.1016\/j.image.2026.117585_b8","first-page":"22605","article-title":"Coot: Cooperative hierarchical transformer for video-text representation learning","volume":"33","author":"Ging","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"7","key":"10.1016\/j.image.2026.117585_b9","article-title":"Video subtitle model with reverse-focus fine-grained multimodal semantic alignment","volume":"42","author":"Cai","year":"2025","journal-title":"Appl. Res. Comput.\/Jisuanji Yingyong Yanjiu"},{"key":"10.1016\/j.image.2026.117585_b10","doi-asserted-by":"crossref","unstructured":"S. Chen, Y.-G. Jiang, Motion guided spatial attention for video captioning, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, 2019, pp. 8191\u20138198, 01.","DOI":"10.1609\/aaai.v33i01.33018191"},{"key":"10.1016\/j.image.2026.117585_b11","series-title":"2022 IEEE International Conference on Multimedia and Expo","first-page":"1","article-title":"Support-set based multi-modal representation enhancement for video captioning","author":"Chen","year":"2022"},{"key":"10.1016\/j.image.2026.117585_b12","series-title":"European Conference on Computer Vision","first-page":"146","article-title":"Aiatrack: Attention in attention for transformer visual tracking","author":"Gao","year":"2022"},{"key":"10.1016\/j.image.2026.117585_b13","doi-asserted-by":"crossref","unstructured":"N. Aafaq, N. Akhtar, W. Liu, S.Z. Gilani, A. Mian, Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 12487\u201312496.","DOI":"10.1109\/CVPR.2019.01277"},{"key":"10.1016\/j.image.2026.117585_b14","doi-asserted-by":"crossref","unstructured":"B. Pan, H. Cai, D.-A. Huang, K.-H. Lee, A. Gaidon, E. Adeli, J.C. Niebles, Spatio-temporal graph for video captioning with knowledge distillation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 10870\u201310879.","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"10.1016\/j.image.2026.117585_b15","doi-asserted-by":"crossref","unstructured":"Q. Zheng, C. Wang, D. Tao, Syntax-aware action targeting for video captioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 13096\u201313105.","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"10.1016\/j.image.2026.117585_b16","doi-asserted-by":"crossref","unstructured":"L. Zhou, Y. Zhou, J.J. Corso, R. Socher, C. Xiong, End-to-end dense video captioning with masked transformer, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 8739\u20138748.","DOI":"10.1109\/CVPR.2018.00911"},{"key":"10.1016\/j.image.2026.117585_b17","doi-asserted-by":"crossref","unstructured":"B. Yang, Y. Zou, F. Liu, C. Zhang, Non-autoregressive coarse-to-fine video captioning, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, 2021, pp. 3119\u20133127, 4.","DOI":"10.1609\/aaai.v35i4.16421"},{"issue":"8","key":"10.1016\/j.image.2026.117585_b18","doi-asserted-by":"crossref","first-page":"2049","DOI":"10.1109\/TMM.2017.2788206","article-title":"Blind quality assessment based on pseudo-reference image","volume":"20","author":"Min","year":"2017","journal-title":"IEEE Trans. Multimed."},{"issue":"2","key":"10.1016\/j.image.2026.117585_b19","doi-asserted-by":"crossref","first-page":"508","DOI":"10.1109\/TBC.2018.2816783","article-title":"Blind image quality estimation via distortion aggravation","volume":"64","author":"Min","year":"2018","journal-title":"IEEE Trans. Broadcast."},{"key":"10.1016\/j.image.2026.117585_b20","doi-asserted-by":"crossref","first-page":"3790","DOI":"10.1109\/TIP.2020.2966081","article-title":"A metric for light field reconstruction, compression, and display quality evaluation","volume":"29","author":"Min","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117585_b21","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2025.3544659","article-title":"Exploring rich subjective quality information for image quality assessment in the wild","author":"Min","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.image.2026.117585_b22","series-title":"European Conference on Computer Vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.image.2026.117585_b23","series-title":"Multi-head attention: Collaborate instead of concatenate","author":"Cordonnier","year":"2020"},{"key":"10.1016\/j.image.2026.117585_b24","doi-asserted-by":"crossref","first-page":"53","DOI":"10.1162\/tacl_a_00353","article-title":"Efficient content-based sparse attention with routing transformers","volume":"9","author":"Roy","year":"2021","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10.1016\/j.image.2026.117585_b25","series-title":"International Conference on Image Analysis and Processing","first-page":"633","article-title":"Dmsanet: Dual multi scale attention network","author":"Sagar","year":"2022"},{"key":"10.1016\/j.image.2026.117585_b26","doi-asserted-by":"crossref","first-page":"3805","DOI":"10.1109\/TIP.2020.2966082","article-title":"A multimodal saliency model for videos with high audio-visual correspondence","volume":"29","author":"Min","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117585_b27","doi-asserted-by":"crossref","first-page":"1882","DOI":"10.1109\/TIP.2023.3251695","article-title":"Attention-guided neural networks for full-reference and no-reference audio-visual quality assessment","volume":"32","author":"Cao","year":"2023","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117585_b28","doi-asserted-by":"crossref","first-page":"6054","DOI":"10.1109\/TIP.2020.2988148","article-title":"Study of subjective and objective quality assessment of audio-visual signals","volume":"29","author":"Min","year":"2020","journal-title":"IEEE Trans. Image Process."},{"issue":"4","key":"10.1016\/j.image.2026.117585_b29","doi-asserted-by":"crossref","first-page":"1224","DOI":"10.3390\/s25041224","article-title":"GazeCapsNet: A lightweight gaze estimation framework","volume":"25","author":"Muksimova","year":"2025","journal-title":"Sensors"},{"key":"10.1016\/j.image.2026.117585_b30","series-title":"Pre-training of deep bidirectional transformers for language understanding (2018)","author":"Devlin","year":"1810"},{"key":"10.1016\/j.image.2026.117585_b31","unstructured":"D. Chen, W.B. Dolan, Collecting highly parallel data for paraphrase evaluation, in: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, 2011, pp. 190\u2013200."},{"key":"10.1016\/j.image.2026.117585_b32","doi-asserted-by":"crossref","unstructured":"J. Xu, T. Mei, T. Yao, Y. Rui, Msr-vtt: A large video description dataset for bridging video and language, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 5288\u20135296.","DOI":"10.1109\/CVPR.2016.571"},{"key":"10.1016\/j.image.2026.117585_b33","doi-asserted-by":"crossref","unstructured":"X. Wang, J. Wu, J. Chen, L. Li, Y.-F. Wang, W.Y. Wang, Vatex: A large-scale, high-quality multilingual dataset for video-and-language research, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 4581\u20134591.","DOI":"10.1109\/ICCV.2019.00468"},{"key":"10.1016\/j.image.2026.117585_b34","doi-asserted-by":"crossref","unstructured":"K. Papineni, S. Roukos, T. Ward, W.-J. Zhu, Bleu: a method for automatic evaluation of machine translation, in: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, 2002, pp. 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"10.1016\/j.image.2026.117585_b35","unstructured":"S. Banerjee, A. Lavie, METEOR: An automatic metric for MT evaluation with improved correlation with human judgments, in: Proceedings of the Acl Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/Or Summarization, 2005, pp. 65\u201372."},{"key":"10.1016\/j.image.2026.117585_b36","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.image.2026.117585_b37","doi-asserted-by":"crossref","unstructured":"R. Vedantam, C. Lawrence Zitnick, D. Parikh, Cider: Consensus-based image description evaluation, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2015, pp. 4566\u20134575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10.1016\/j.image.2026.117585_b38","doi-asserted-by":"crossref","unstructured":"S. Liu, Z. Ren, J. Yuan, Sibnet: Sibling convolutional encoder for video captioning, in: Proceedings of the 26th ACM International Conference on Multimedia, 2018, pp. 1425\u20131434.","DOI":"10.1145\/3240508.3240667"},{"issue":"1","key":"10.1016\/j.image.2026.117585_b39","doi-asserted-by":"crossref","first-page":"229","DOI":"10.1109\/TMM.2019.2924576","article-title":"STAT: Spatial-temporal attention mechanism for video captioning","volume":"22","author":"Yan","year":"2019","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.image.2026.117585_b40","doi-asserted-by":"crossref","unstructured":"B. Wang, L. Ma, W. Zhang, W. Jiang, J. Wang, W. Liu, Controllable video captioning with pos sequence guidance based on gated fusion network, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 2641\u20132650.","DOI":"10.1109\/ICCV.2019.00273"},{"issue":"10","key":"10.1016\/j.image.2026.117585_b41","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3679203","article-title":"Action-aware linguistic skeleton optimization network for non-autoregressive video captioning","volume":"20","author":"Chen","year":"2024","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"issue":"4","key":"10.1016\/j.image.2026.117585_b42","doi-asserted-by":"crossref","first-page":"11187","DOI":"10.1007\/s11042-023-15978-7","article-title":"Video captioning using sentence vector-enabled convolutional framework with short-connected LSTM","volume":"83","author":"Naik","year":"2024","journal-title":"Multimedia Tools Appl."},{"key":"10.1016\/j.image.2026.117585_b43","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110744","article-title":"Rethink video retrieval representation for video captioning","volume":"156","author":"Tian","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.image.2026.117585_b44","doi-asserted-by":"crossref","unstructured":"Z. Zhang, Y. Shi, C. Yuan, B. Li, P. Wang, W. Hu, Z.-J. Zha, Object relational graph with teacher-recommended learning for video captioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 13278\u201313288.","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"10.1016\/j.image.2026.117585_b45","doi-asserted-by":"crossref","unstructured":"Y. Shen, X. Gu, K. Xu, H. Fan, L. Wen, L. Zhang, Accurate and fast compressed video captioning, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 15558\u201315567.","DOI":"10.1109\/ICCV51070.2023.01426"},{"key":"10.1016\/j.image.2026.117585_b46","doi-asserted-by":"crossref","first-page":"4013","DOI":"10.1109\/TIP.2020.2969330","article-title":"Image captioning with end-to-end attribute detection and subsequent attributes prediction","volume":"29","author":"Huang","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117585_b47","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107702","article-title":"Enhancing the alignment between target words and corresponding frames for video captioning","volume":"111","author":"Tu","year":"2021","journal-title":"Pattern Recognit."},{"issue":"2","key":"10.1016\/j.image.2026.117585_b48","doi-asserted-by":"crossref","first-page":"880","DOI":"10.1109\/TCSVT.2021.3063423","article-title":"Syntax-guided hierarchical attention network for video captioning","volume":"32","author":"Deng","year":"2021","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.image.2026.117585_b49","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.109204","article-title":"Relation-aware attention for video captioning via graph learning","volume":"136","author":"Tu","year":"2023","journal-title":"Pattern Recognit."},{"issue":"1s","key":"10.1016\/j.image.2026.117585_b50","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3539225","article-title":"Retrieval augmented convolutional encoder-decoder networks for video captioning","volume":"19","author":"Chen","year":"2023","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.image.2026.117585_b51","doi-asserted-by":"crossref","unstructured":"Y. Shen, L. Yang, L. Wen, H. Yu, E. Elhamifar, H. Wang, Exploring the role of audio in video captioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 2090\u20132100.","DOI":"10.1109\/CVPRW63382.2024.00214"},{"key":"10.1016\/j.image.2026.117585_b52","doi-asserted-by":"crossref","first-page":"202","DOI":"10.1109\/TIP.2021.3120867","article-title":"Hierarchical representation network with auxiliary tasks for video captioning and video question answering","volume":"31","author":"Gao","year":"2021","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117585_b53","doi-asserted-by":"crossref","first-page":"2367","DOI":"10.1109\/TMM.2023.3295098","article-title":"Memory-based augmentation network for video captioning","volume":"26","author":"Jing","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.image.2026.117585_b54","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2025.3541965","article-title":"Frame-by-frame multi-object tracking-guided video captioning","author":"Luo","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.image.2026.117585_b55","doi-asserted-by":"crossref","unstructured":"H. Ryu, S. Kang, H. Kang, C.D. Yoo, Semantic grouping network for video captioning, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, 2021, pp. 2514\u20132522, 3.","DOI":"10.1609\/aaai.v35i3.16353"},{"key":"10.1016\/j.image.2026.117585_b56","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.109202","article-title":"A multi-layer memory sharing network for video captioning","volume":"136","author":"Niu","year":"2023","journal-title":"Pattern Recognit."}],"container-title":["Signal Processing: Image Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0923596526001086?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0923596526001086?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T02:36:00Z","timestamp":1780540560000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0923596526001086"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":56,"alternative-id":["S0923596526001086"],"URL":"https:\/\/doi.org\/10.1016\/j.image.2026.117585","relation":{},"ISSN":["0923-5965"],"issn-type":[{"value":"0923-5965","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"CSACE-Net: Cross-Modal Semantic Attention Co-Enhancement for video captioning","name":"articletitle","label":"Article Title"},{"value":"Signal Processing: Image Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.image.2026.117585","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"117585"}}