{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,8]],"date-time":"2026-02-08T17:19:14Z","timestamp":1770571154633,"version":"3.49.0"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T00:00:00Z","timestamp":1743724800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T00:00:00Z","timestamp":1743724800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s00530-025-01775-9","type":"journal-article","created":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T15:20:09Z","timestamp":1743866409000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Dual-visual collaborative enhanced transformer for image captioning"],"prefix":"10.1007","volume":"31","author":[{"given":"Zhenping","family":"Mou","sequence":"first","affiliation":[]},{"given":"Tianqi","family":"Song","sequence":"additional","affiliation":[]},{"given":"Hong","family":"Luo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,4]]},"reference":[{"key":"1775_CR1","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TMM.2020.3011317","volume":"23","author":"J Wu","year":"2020","unstructured":"Wu, J., Chen, T., Wu, H., et al.: Fine-grained image captioning with global-local discriminative objective. IEEE Trans. Multimed. 23, 2413\u20132427 (2020)","journal-title":"IEEE Trans. Multimed."},{"key":"1775_CR2","doi-asserted-by":"publisher","first-page":"3101","DOI":"10.1109\/TMM.2021.3093725","volume":"24","author":"Z Zhang","year":"2021","unstructured":"Zhang, Z., Wu, Q., Wang, Y., et al.: Exploring pairwise relationships adaptively from linguistic context in image captioning. IEEE Trans. Multimed. 24, 3101\u20133113 (2021)","journal-title":"IEEE Trans. Multimed."},{"key":"1775_CR3","doi-asserted-by":"publisher","first-page":"1271","DOI":"10.1109\/TIP.2019.2940693","volume":"29","author":"H Cui","year":"2019","unstructured":"Cui, H., Zhu, L., Li, J.J., Yang, Y., Nie, L.Q.: Scalable deep hashing for large-scale social image retrieval. IEEE Trans. Image Process. 29, 1271\u20131284 (2019)","journal-title":"IEEE Trans. Image Process."},{"issue":"32","key":"1775_CR4","doi-asserted-by":"publisher","first-page":"23311","DOI":"10.1007\/s00521-021-06012-8","volume":"35","author":"MK Chowdary","year":"2023","unstructured":"Chowdary, M.K., Nguyen, T.N., Hemanth, D.J.: Deep learning-based facial emotion recognition for human\u2013computer interaction applications. Neural Comput. Appl. 35(32), 23311\u201323328 (2023)","journal-title":"Neural Comput. Appl."},{"key":"1775_CR5","first-page":"28825","volume":"27","author":"I Sutskever","year":"2014","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. Adv. Neural Inform. Process. Syst. 27, 28825 (2014)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"1775_CR6","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., et al:. 
Knowing when to look: Adaptive attention via a visual sentinel for image captioning[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 375\u2013383 (2017)","DOI":"10.1109\/CVPR.2017.345"},{"key":"1775_CR7","unstructured":"Xu, K., Ba, J., Kiros, R., et al.: Show, attend and tell: Neural image caption generation with visual attention[C]\/\/International conference on machine learning. PMLR. 2048\u20132057 (2015)"},{"issue":"5","key":"1775_CR8","first-page":"1112","volume":"42","author":"LL Gao","year":"2020","unstructured":"Gao, L.L., Li, X.P., Song, J.K., Shen, H.T.: Hierarchical LSTMs with adaptive attention for visual captioning. IEEE Trans. Pattern Anal. Mach. Intell. 42(5), 1112\u20131131 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1775_CR9","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., et al.: Bottom-up and top-down attention for image captioning and visual question answering[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1775_CR10","doi-asserted-by":"crossref","unstructured":"Albawi, S., Mohammed, T.A., Al-Zawi, S.: Understanding of a convolutional neural network[C]\/\/2017 international conference on engineering and technology (ICET). IEEE 1\u20136 (2017)","DOI":"10.1109\/ICEngTechnol.2017.8308186"},{"key":"1775_CR11","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929, (2020)"},{"issue":"6","key":"1775_CR12","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren, S., He, K., Girshick, R., et al.: Faster R-CNN: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39(6), 1137\u20131149 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1775_CR13","doi-asserted-by":"crossref","unstructured":"Song, Z., Zhou, X., Dong, L., et al.: Direction relation transformer for image captioning[C]\/\/Proceedings of the 29th ACM International Conference on Multimedia. 5056\u20135064 (2021)","DOI":"10.1145\/3474085.3475607"},{"key":"1775_CR14","doi-asserted-by":"crossref","unstructured":"Li, G., Zhu, L., Liu, P., et al.: Entangled transformer for image captioning[C]\/\/Proceedings of the IEEE\/CVF international conference on computer vision. 8928\u20138937 (2019)","DOI":"10.1109\/ICCV.2019.00902"},{"issue":"3","key":"1775_CR15","first-page":"2286","volume":"35","author":"Y Luo","year":"2021","unstructured":"Luo, Y., Ji, J., Sun, X., et al.: Dual-level collaborative transformer for image captioning. Proceed. AAAI Conf. Artif. Intell. 35(3), 2286\u20132293 (2021)","journal-title":"Proceed. AAAI Conf. Artif. Intell."},{"key":"1775_CR16","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","volume":"148","author":"T Xian","year":"2022","unstructured":"Xian, T., Li, Z., Zhang, C., et al.: Dual global enhanced transformer for image captioning[J]. Neural Netw. 
148, 129\u2013141 (2022)","journal-title":"Neural Netw."},{"key":"1775_CR17","first-page":"740","volume-title":"European conference on computer vision","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision, pp. 740\u2013755. Springer, Cham (2014)"},{"key":"1775_CR18","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., et al.: Show and tell: A neural image caption generator[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1775_CR19","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"1775_CR20","doi-asserted-by":"crossref","unstructured":"Pan, Y.W., Yao, T., Li, Y.H., Mei, T.: X-linear attention networks for image captioning. In: Proc. of the 2020 IEEE\/CVF Conf. on Computer Vision and Pattern Recognition. Seattle: IEEE. 10968\u201310977 (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"1775_CR21","doi-asserted-by":"crossref","unstructured":"Jiang, H., Misra, I., Rohrbach, M., et al.: In defense of grid features for visual question answering[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 10267\u201310276 (2020)","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"1775_CR22","doi-asserted-by":"crossref","unstructured":"Zhang, X., Sun, X., Luo, Y., et al.: Rstnet: Captioning with adaptive attention on visual and non-visual words[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 15465\u201315474 (2021)","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"1775_CR23","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1706.03762","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention is all you need. Adv. Neural Inform. Process. Syst. (2017). https:\/\/doi.org\/10.48550\/arXiv.1706.03762","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"1775_CR24","unstructured":"Devlin, J., Chang, M. W., Lee, K., et al.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, (2018)"},{"key":"1775_CR25","first-page":"11960","volume":"34","author":"Y Wang","year":"2021","unstructured":"Wang, Y., Huang, R., Song, S., et al.: Not all images are worth 16x16 words: dynamic transformers for efficient image recognition[J]. Adv. Neural. Inf. Process. Syst. 34, 11960\u201311973 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1775_CR26","doi-asserted-by":"crossref","unstructured":"Yang, X., Zhang, H., Cai, J.: Learning to collocate neural modules for image captioning[C]\/\/Proceedings of the IEEE\/CVF International Conference on Computer Vision. 4250\u20134260 (2019)","DOI":"10.1109\/ICCV.2019.00435"},{"key":"1775_CR27","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Zhu, X., et al.: Normalized and geometry-aware self-attention network for image captioning[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 
10327\u201310336 (2020)","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"1775_CR28","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., et al.: Attention on attention for image captioning[C]\/\/Proceedings of the IEEE\/CVF international conference on computer vision. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"1775_CR29","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., et al.: Self-critical sequence training for image captioning[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 7008\u20137024 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"1775_CR30","doi-asserted-by":"crossref","unstructured":"Kuo, C.W., Kira, Z.: HAAV: Hierarchical Aggregation of Augmented Views for Image Captioning[C]\/\/Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 11039\u201311049 (2023)","DOI":"10.1109\/CVPR52729.2023.01062"},{"key":"1775_CR31","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T. and Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1775_CR32","doi-asserted-by":"crossref","unstructured":"Denkowski, M. and Lavie, A.: Meteor universal: Language specific translation evaluation for any target language. In: Proceedings of the ninth workshop on statistical machine translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"1775_CR33","unstructured":"Lin, C-Y.: A package for automatic evaluation of summaries. In Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"1775_CR34","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C. and Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pages 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1775_CR35","first-page":"382","volume-title":"European conference on computer vision","author":"Peter Anderson","year":"2016","unstructured":"Anderson, Peter, Fernando, Basura, Johnson, Mark, Gould, Stephen: Spice: Semantic propositional image caption evaluation. In: European conference on computer vision, pp. 382\u2013398. Springer, Cham (2016)"},{"issue":"1","key":"1775_CR36","first-page":"411","volume":"7","author":"M Honnibal","year":"2017","unstructured":"Honnibal, M., Montani, I.: spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing. To Appear 7(1), 411\u2013420 (2017)","journal-title":"To Appear"},{"key":"1775_CR37","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L. and Cucchiara, R.: Meshed-memory transformer for image captioning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"issue":"1","key":"1775_CR38","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/s13735-023-00266-9","volume":"12","author":"Z Jiang","year":"2023","unstructured":"Jiang, Z., Wang, X., Zhai, Z., et al.: LG-MLFormer: local and global MLP for image captioning. Int. J. Multimed. Inform. Retr. 12(1), 4 (2023)","journal-title":"Int. J. Multimed. Inform. 
Retr."},{"issue":"3","key":"1775_CR39","first-page":"2585","volume":"36","author":"Y Wang","year":"2022","unstructured":"Wang, Y., Xu, J., Sun, Y.: End-to-end transformer based model for image captioning. Proceed. AAAI Conf. Artif. Intell. 36(3), 2585\u20132594 (2022)","journal-title":"Proceed. AAAI Conf. Artif. Intell."},{"key":"1775_CR40","doi-asserted-by":"crossref","unstructured":"Barraco, M., Stefanini, M., Cornia, M., et al.: CaMEL: mean teacher learning for image captioning[C]\/\/2022 26th International Conference on Pattern Recognition (ICPR). IEEE 4087\u20134094 (2022)","DOI":"10.1109\/ICPR56361.2022.9955644"},{"key":"1775_CR41","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Zhu, X., Yao, P., Lu, S. and Lu, H.: Normalized and geometry-aware self-attention network for image captioning. CVPR pp. 10324\u201310333 (2020)","DOI":"10.1109\/CVPR42600.2020.01034"},{"issue":"2","key":"1775_CR42","first-page":"1655","volume":"35","author":"J Ji","year":"2021","unstructured":"Ji, J., Luo, Y., Sun, X., et al.: Improving image captioning by leveraging intra-and inter-layer global representation in transformer network. Proceed. AAAI Conf. Artif. Intell. 35(2), 1655\u20131663 (2021)","journal-title":"Proceed. AAAI Conf. Artif. Intell."},{"issue":"8","key":"1775_CR43","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3663667","volume":"20","author":"S Sarto","year":"2024","unstructured":"Sarto, S., Cornia, M., Baraldi, L., et al.: Towards retrieval-augmented architectures for image captioning. ACM Trans. Multimed. Comput., Commun. Appl. 20(8), 1\u201322 (2024)","journal-title":"ACM Trans. Multimed. Comput., Commun. Appl."},{"key":"1775_CR44","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.119774","volume":"223","author":"H Parvin","year":"2023","unstructured":"Parvin, H., Naghsh-Nilchi, A.R., Mohammadi, H.M.: Transformer-based local-global guidance for image captioning. Expert Syst. Appl. 223, 119774 (2023)","journal-title":"Expert Syst. Appl."},{"key":"1775_CR45","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., et al.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"1775_CR46","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109420","volume":"138","author":"Y Ma","year":"2023","unstructured":"Ma, Y., Ji, J., Sun, X., et al.: Towards local visual modeling for image captioning. Pattern Recogn. 138, 109420 (2023)","journal-title":"Pattern Recogn."},{"key":"1775_CR47","doi-asserted-by":"crossref","unstructured":"Nguyen, V.Q., Suganuma, M., Okatani, T.: Grit: Faster and better image captioning transformer using dual visual features[C]\/\/European Conference on Computer Vision. Cham: Springer Nature Switzerland, pp. 167\u2013184 (2022)","DOI":"10.1007\/978-3-031-20059-5_10"},{"key":"1775_CR48","first-page":"2966","volume":"25","author":"Z Song","year":"2024","unstructured":"Song, Z., Hu, Z., Zhou, Y., et al.: Embedded heterogeneous attention transformer for cross-lingual image captioning. IEEE Trans. Multimed. 25, 2966\u20132977 (2024)","journal-title":"IEEE Trans. 
Multimed."}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01775-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01775-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01775-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T15:01:36Z","timestamp":1756998096000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01775-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,4]]},"references-count":48,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["1775"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01775-9","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,4]]},"assertion":[{"value":"19 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"181"}}