{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T08:15:58Z","timestamp":1773994558298,"version":"3.50.1"},"reference-count":62,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T00:00:00Z","timestamp":1735516800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T00:00:00Z","timestamp":1735516800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s00530-024-01608-1","type":"journal-article","created":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T09:27:13Z","timestamp":1735550833000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["IGINet: integrating geometric information to enhance inter-modal interaction for fine-grained image captioning"],"prefix":"10.1007","volume":"31","author":[{"given":"Md. 
Shamim","family":"Hossain","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shamima","family":"Aktar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weiyong","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Naijie","family":"Gu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhangjin","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,30]]},"reference":[{"key":"1608_CR1","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, MA., Young, P., Rashtchian, C., Hockenmaier, J., et\u00a0al.: Every picture tells a story: generating sentences from images. In: Computer Vision\u2013ECCV 2010: 11th European Conference on Computer Vision, Heraklion, Crete, Greece, September 5\u201311, 2010, Proceedings, Part IV 11, pp. 15\u201329. Springer (2010)","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"1608_CR2","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., et\u00a0al. Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057. PMLR (2015)"},{"key":"1608_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., et\u00a0al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1608_CR4","doi-asserted-by":"crossref","unstructured":"Rennie, SJ., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7008\u20137024 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"1608_CR5","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-024-20220-z","author":"MS Hossain","year":"2024","unstructured":"Hossain, M.S., Aktar, S., Hossen, M.B., Hossain, M.A., Gu, N., Huang, Z.: CSDNet: cross-sketch with dual gated attention for fine-grained image captioning network. Multimedia Tools Appl. (2024). https:\/\/doi.org\/10.1007\/s11042-024-20220-z","journal-title":"Multimedia Tools Appl."},{"key":"1608_CR6","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127651","volume":"585","author":"G Ge","year":"2024","unstructured":"Ge, G., Han, Y., Hao, L., Hao, K., Wei, B., Xs, Tang: Show, tell and rectify: boost image caption generation via an output rectifier. Neurocomputing 585, 127651 (2024)","journal-title":"Neurocomputing"},{"key":"1608_CR7","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1608_CR8","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, XY.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"1608_CR9","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
4651\u20134659 (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"1608_CR10","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.117174","volume":"201","author":"C Wang","year":"2022","unstructured":"Wang, C., Shen, Y., Ji, L.: Geometry Attention Transformer with position-aware LSTMs for image captioning. Expert Syst. Appl. 201, 117174 (2022)","journal-title":"Expert Syst. Appl."},{"key":"1608_CR11","doi-asserted-by":"crossref","unstructured":"Cho, K., Van\u00a0Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., et\u00a0al.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"1608_CR12","unstructured":"Herdade, S., Kappeler, A., Boakye, K., Soares J.: Image captioning: transforming objects into words. In: Advances in Neural Information Processing Systems, p. 32 (2019)"},{"key":"1608_CR13","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127823","volume":"593","author":"X Yang","year":"2024","unstructured":"Yang, X., Yang, Y., Ma, S., Li, Z., Dong, W., Wo\u017aniak, M.: SAMT-generator: a second-attention for image captioning based on multi-stage transformer network. Neurocomputing 593, 127823 (2024)","journal-title":"Neurocomputing"},{"key":"1608_CR14","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1016\/j.neucom.2022.11.045","volume":"519","author":"N Hu","year":"2023","unstructured":"Hu, N., Fan, C., Ming, Y., Feng, F.: MAENet: a novel multi-head association attention enhancement network for completing intra-modal interaction in image captioning. Neurocomputing 519, 69\u201381 (2023)","journal-title":"Neurocomputing"},{"key":"1608_CR15","doi-asserted-by":"crossref","unstructured":"Li, L., Tang, S., Deng, L., Zhang, Y., Tian, Q.: Image caption with global-local attention. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a031, pp. 
4133\u20134139 (2017)","DOI":"10.1609\/aaai.v31i1.11236"},{"key":"1608_CR16","doi-asserted-by":"crossref","unstructured":"Tu, Y., Zhang, X., Liu, B., Yan, C.: Video description with spatial-temporal attention. In: Proceedings of the 25th ACM International Conference on Multimedia, pp. 1014\u20131022 (2017)","DOI":"10.1145\/3123266.3123354"},{"key":"1608_CR17","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, p. 28 (2015)"},{"key":"1608_CR18","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T.: X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10971\u201310980 (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"1608_CR19","doi-asserted-by":"publisher","unstructured":"Hossain, M.S.: GF-FRCNN MSCOCO. Mendeley Data. https:\/\/doi.org\/10.17632\/sf238jg557.3","DOI":"10.17632\/sf238jg557.3"},{"key":"1608_CR20","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., Socher, R.: Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 375\u2013383 (2017)","DOI":"10.1109\/CVPR.2017.345"},{"key":"1608_CR21","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F., Srivastava, RK., Deng, L., Doll\u00e1r, P. et\u00a0al.: From captions to visual concepts and back. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1473\u20131482 (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"1608_CR22","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern recognition, pp. 
10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"1608_CR23","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., Gao, J.: Unified vision-language pre-training for image captioning and vqa. In: Proceedings of the AAAI conference on artificial intelligence. vol.\u00a034; 2020. p. 13041\u201313049","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"1608_CR24","doi-asserted-by":"crossref","unstructured":"Tu, Y., Li, L., Su, L., Zha, ZJ., Yan, C., Huang, Q.: Self-supervised cross-view representation reconstruction for change captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2805\u20132815 (2023)","DOI":"10.1109\/ICCV51070.2023.00263"},{"key":"1608_CR25","doi-asserted-by":"crossref","unstructured":"Tu, Y., Li, L., Su, L., Yan, C., Huang, Q.: Distractors-immune representation learning with cross-modal contrastive regularization for change captioning. In: European Conference on Computer Vision, pp. 311\u2013328. Springer(2025)","DOI":"10.1007\/978-3-031-72775-7_18"},{"key":"1608_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2022.102238","volume":"73","author":"W Jiang","year":"2022","unstructured":"Jiang, W., Li, Q., Zhan, K., Fang, Y., Shen, F.: Hybrid attention network for image captioning. Displays 73, 102238 (2022)","journal-title":"Displays"},{"key":"1608_CR27","doi-asserted-by":"crossref","unstructured":"Hu, H., Gu, J., Zhang, Z., Dai, J., Wei, Y.: Relation networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3588\u20133597 (2018)","DOI":"10.1109\/CVPR.2018.00378"},{"key":"1608_CR28","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Zhu, X., Yao, P., Lu, S., Lu, H.: Normalized and geometry-aware self-attention network for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
10327\u201310336 (2020)","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"1608_CR29","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., Mei, T.: Exploring visual relationship for image captioning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 684\u2013699 (2018)","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"1608_CR30","doi-asserted-by":"crossref","unstructured":"Luo, Y., Ji, J., Sun, X., Cao, L., Wu, Y., Huang, F., et\u00a0al.: Dual-level collaborative transformer for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 2286\u20132293 (2021)","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"1608_CR31","doi-asserted-by":"publisher","first-page":"8828","DOI":"10.1109\/TMM.2023.3242142","volume":"25","author":"S Yue","year":"2023","unstructured":"Yue, S., Tu, Y., Li, L., Yang, Y., Gao, S., Yu, Z.: I3N: intra-and inter-representation interaction network for change captioning. IEEE Trans. Multimedia 25, 8828\u20138841 (2023)","journal-title":"IEEE Trans. Multimedia"},{"key":"1608_CR32","doi-asserted-by":"publisher","first-page":"4926","DOI":"10.1109\/TPAMI.2024.3365104","volume":"46","author":"Y Tu","year":"2024","unstructured":"Tu, Y., Li, L., Su, L., Zha, Z.J., Huang, Q.: SMART: syntax-calibrated multi-aspect relation transformer for change captioning. IEEE Trans. Pattern Anal. Mach. Intell. 46, 4926\u20134943 (2024)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1608_CR33","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1608_CR34","unstructured":"Chen, X., Fang, H., Lin, TY., Vedantam, R., Gupta, S., Doll\u00e1r, P., et\u00a0al.: Microsoft coco captions: data collection and evaluation server. 
arXiv preprint arXiv:1504.00325 (2015)"},{"key":"1608_CR35","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"1608_CR36","unstructured":"Dauphin, YN., Fan, A., Auli, M., Grangier, D.: Language modeling with gated convolutional networks. In: International Conference on Machine Learning, pp. 933\u2013941. PMLR (2017)"},{"key":"1608_CR37","doi-asserted-by":"crossref","unstructured":"Lin, TY., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., et\u00a0al.: Microsoft COCO: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1608_CR38","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"1608_CR39","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, WJ.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1608_CR40","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp. 65\u201372 (2005)"},{"key":"1608_CR41","unstructured":"Lin, CY.: ROUGE: a package for automatic evaluation of summaries. 
In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"1608_CR42","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDER: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1608_CR43","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part V 14, pp. 382\u2013398. Springer (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1608_CR44","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L., ImageNet: a large-scale hierarchical image database. In: IEEE Conference on Computer Vision and Pattern Recognition, vol . 2009, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1608_CR45","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., et al.: Visual Genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"1608_CR46","unstructured":"Kingma, D.P., Ba, J., Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"issue":"8","key":"1608_CR47","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 
9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"issue":"11","key":"1608_CR48","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Signal Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Signal Process."},{"key":"1608_CR49","doi-asserted-by":"crossref","unstructured":"Jiang, W., Ma, L., Jiang, YG., Liu, W., Zhang, T.: Recurrent fusion network for image captioning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 499\u2013515 (2018)","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"1608_CR50","doi-asserted-by":"crossref","unstructured":"Yang, X., Tang, K., Zhang, H., Cai, J.: Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10685\u201310694 (2019)","DOI":"10.1109\/CVPR.2019.01094"},{"key":"1608_CR51","doi-asserted-by":"crossref","unstructured":"Qin, Y., Du, J., Zhang, Y., Lu, H.: Look back and predict forward in image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8367\u20138375 (2019)","DOI":"10.1109\/CVPR.2019.00856"},{"key":"1608_CR52","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2021.102069","volume":"70","author":"C Bai","year":"2021","unstructured":"Bai, C., Zheng, A., Huang, Y., Pan, X., Chen, N.: Boosting convolutional image captioning with semantic content and visual relationship. Displays 70, 102069 (2021)","journal-title":"Displays"},{"key":"1608_CR53","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2022.102210","volume":"74","author":"S Zhao","year":"2022","unstructured":"Zhao, S., Li, L., Peng, H.: Aligned visual semantic scene graph for image captioning. 
Displays 74, 102210 (2022)","journal-title":"Displays"},{"issue":"2","key":"1608_CR54","doi-asserted-by":"publisher","first-page":"890","DOI":"10.1109\/TCYB.2022.3156367","volume":"54","author":"Y Yang","year":"2022","unstructured":"Yang, Y., Wei, H., Zhu, H., Yu, D., Xiong, H., Yang, J.: Exploiting cross-modal prediction and relation consistency for semisupervised image captioning. IEEE Trans. Cybern. 54(2), 890\u2013902 (2022)","journal-title":"IEEE Trans. Cybern."},{"issue":"1","key":"1608_CR55","doi-asserted-by":"publisher","first-page":"1223","DOI":"10.1007\/s11042-022-13279-z","volume":"82","author":"D Zhao","year":"2023","unstructured":"Zhao, D., Yang, R., Wang, Z., Qi, Z.: A cooperative approach based on self-attention with interactive attribute for image caption. Multimedia Tools Appl. 82(1), 1223\u20131236 (2023)","journal-title":"Multimedia Tools Appl."},{"key":"1608_CR56","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111433","volume":"287","author":"C Cai","year":"2024","unstructured":"Cai, C., Wang, S., Yap, K.H., Wang, Y.: Top-down framework for weakly-supervised grounded image captioning. Knowl. Based Syst. 287, 111433 (2024)","journal-title":"Knowl. Based Syst."},{"key":"1608_CR57","unstructured":"Al-Qatf, M., Hawbani, A., Wang, X., Abdusallam, A., Alsamhi, S., Alhabib, M., et\u00a0al.: RVAIC: refined visual attention for improved image captioning. J. Intel. Fuzzy Syst. 1\u201313 (2024) (Preprint)"},{"key":"1608_CR58","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2023.107732","volume":"131","author":"M Al-Qatf","year":"2024","unstructured":"Al-Qatf, M., Hawbani, A., Wang, X., Abdusallam, A., Zhao, L., Alsamhi, S.H., et al.: NPoSC-A3: a novel part of speech clues-aware adaptive attention mechanism for image captioning. Eng. Appl. Artif. 
Intell."},{"key":"1608_CR59","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics, Volume 1: Long Papers, pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"issue":"1","key":"1608_CR60","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1109\/TCSVT.2021.3067449","volume":"32","author":"C Yan","year":"2021","unstructured":"Yan, C., Hao, Y., Li, L., Yin, J., Liu, A., Mao, Z., et al.: Task-adaptive attention for image captioning. IEEE Trans. Circuits Syst. Video Technol. 32(1), 43\u201351 (2021)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1608_CR61","doi-asserted-by":"crossref","unstructured":"Qian, K., Pan, Y., Xu, H., Tian, L.: Transformer model incorporating local graph semantic attention for image caption. The Visual Computer, pp. 1\u201312 (2023)","DOI":"10.1007\/s00371-023-03180-7"},{"key":"1608_CR62","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2023.102377","volume":"77","author":"L Chen","year":"2023","unstructured":"Chen, L., Yang, Y., Hu, J., Pan, L., Zhai, H.: Relational-convergent transformer for image captioning. 
Displays 77, 102377 (2023)","journal-title":"Displays"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01608-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-024-01608-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01608-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T11:00:26Z","timestamp":1740740426000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-024-01608-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,30]]},"references-count":62,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["1608"],"URL":"https:\/\/doi.org\/10.1007\/s00530-024-01608-1","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,30]]},"assertion":[{"value":"13 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 December 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 December 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors state that there are no financial Conflict of interest or personal relationships that could have influenced the work reported in 
this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"33"}}