{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T02:55:29Z","timestamp":1772160929652,"version":"3.50.1"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100007847","name":"Natural Science Foundation of Jilin Province","doi-asserted-by":"publisher","award":["YDZJ202101ZYTS128"],"award-info":[{"award-number":["YDZJ202101ZYTS128"]}],"id":[{"id":"10.13039\/100007847","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2024,1]]},"DOI":"10.1007\/s10489-023-05198-9","type":"journal-article","created":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T09:03:16Z","timestamp":1701939796000},"page":"231-245","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Integrating grid features and geometric coordinates for enhanced image captioning"],"prefix":"10.1007","volume":"54","author":[{"given":"Fengzhi","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Zhezhou","family":"Yu","sequence":"additional","affiliation":[]},{"given":"He","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8060-4725","authenticated-orcid":false,"given":"Tian","family":"Bai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,7]]},"reference":[{"key":"5198_CR1","doi-asserted-by":"publisher","unstructured":"Stefanini M, Cornia M, Baraldi L et al (2023) From show to tell: a survey on deep learning-based image captioning. IEEE Transactions on Pattern Analysis and Machine Intelligence 45(1):539\u2013559. https:\/\/doi.org\/10.1109\/TPAMI.2022.3148210","DOI":"10.1109\/TPAMI.2022.3148210"},{"key":"5198_CR2","doi-asserted-by":"crossref","unstructured":"Jia J, Ding X, Pang S et\u00a0al (2023) Image captioning based on scene graphs: a survey. Expert Syst Appl pp 120698","DOI":"10.1016\/j.eswa.2023.120698"},{"issue":"5","key":"5198_CR3","doi-asserted-by":"publisher","first-page":"3833","DOI":"10.1007\/s10462-021-10092-2","volume":"55","author":"Z Zohourianshahzadi","year":"2022","unstructured":"Zohourianshahzadi Z, Kalita JK (2022) Neural attention for image captioning: review of outstanding methods. Artif Intell Rev 55(5):3833\u20133862","journal-title":"Artif Intell Rev"},{"issue":"6","key":"5198_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain MZ, Sohel F, Shiratuddin MF et al (2019) A comprehensive survey of deep learning for image captioning. 
ACM Computing Surveys (CsUR) 51(6):1\u201336","journal-title":"ACM Computing Surveys (CsUR)"},{"key":"5198_CR5","unstructured":"Xu K, Ba J, Kiros R et\u00a0al (2015) Show, attend and tell: Neural image caption generation with visual attention. In: International conference on machine learning, PMLR, pp 2048\u20132057"},{"key":"5198_CR6","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C et\u00a0al (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"5198_CR7","doi-asserted-by":"crossref","unstructured":"Nguyen VQ, Suganuma M, Okatani T (2022) Grit: faster and better image captioning transformer using dual visual features. In: Computer vision\u2013ECCV 2022: 17th European conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXVI, Springer, pp 167\u2013184","DOI":"10.1007\/978-3-031-20059-5_10"},{"key":"5198_CR8","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y et\u00a0al (2021) Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10,012\u201310,022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"5198_CR9","unstructured":"Zhu X, Su W, Lu L et\u00a0al (2021) Deformable detr: deformable transformers for end-to-end object detection. In: International conference on learning representations"},{"key":"5198_CR10","unstructured":"Herdade S, Kappeler A, Boakye K et\u00a0al (2019) Image captioning: transforming objects into words. Adv Neural Inform Process Syst 32"},{"key":"5198_CR11","doi-asserted-by":"crossref","unstructured":"Cornia M, Stefanini M, Baraldi L et\u00a0al (2020) Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10,578\u201310,587","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"5198_CR12","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","volume":"148","author":"T Xian","year":"2022","unstructured":"Xian T, Li Z, Zhang C et al (2022) Dual global enhanced transformer for image captioning. Neural Netw 148:129\u2013141","journal-title":"Neural Netw"},{"key":"5198_CR13","doi-asserted-by":"publisher","unstructured":"Yu J, Li J, Yu Z et al (2020) Multimodal transformer with multi-view visual representation for image captioning. IEEE Transactions on Circuits and Systems for Video Technology 30(12):4467\u20134480. https:\/\/doi.org\/10.1109\/TCSVT.2019.2947482","DOI":"10.1109\/TCSVT.2019.2947482"},{"key":"5198_CR14","doi-asserted-by":"publisher","unstructured":"Zhu Y, Xia Q, Jin W (2022) Srdd: a lightweight end-to-end object detection with transformer. Connect Sci 34(1):2448\u20132465. https:\/\/doi.org\/10.1080\/09540091.2022.2125499","DOI":"10.1080\/09540091.2022.2125499"},{"key":"5198_CR15","doi-asserted-by":"publisher","unstructured":"Su J, Tang J, Lu Z et al (2019) A neural image captioning model with caption-to-images semantic constructor. Neurocompu-ting 367:144\u2013151. https:\/\/doi.org\/10.1016\/j.neucom.2019.08.012. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0925231219311312","DOI":"10.1016\/j.neucom.2019.08.012"},{"key":"5198_CR16","doi-asserted-by":"crossref","unstructured":"Zhu C, Ye X, Lu Q (2022) Semantic space captioner: generating image captions step by step. 
J Electron Imaging 31(6):063,021\u2013063,021","DOI":"10.1117\/1.JEI.31.6.063021"},{"issue":"3","key":"5198_CR17","doi-asserted-by":"publisher","first-page":"808","DOI":"10.1109\/TMM.2019.2931815","volume":"22","author":"L Wu","year":"2020","unstructured":"Wu L, Xu M, Wang J et al (2020) Recall what you see continually using gridlstm in image captioning. IEEE Transactions on multimedia 22(3):808\u2013818. https:\/\/doi.org\/10.1109\/TMM.2019.2931815","journal-title":"IEEE Transactions on multimedia"},{"key":"5198_CR18","doi-asserted-by":"crossref","unstructured":"Haque AU, Ghani S, Saeed M (2021) Image captioning with positional and geometrical semantics. IEEE Access 9:160,917\u2013160,925","DOI":"10.1109\/ACCESS.2021.3131343"},{"key":"5198_CR19","doi-asserted-by":"crossref","unstructured":"Kuznetsova P, Ordonez V, Berg TL et al (2014) Treetalk: composition and compression of trees for image descriptions. Trans Assoc Comput Linguistics 2:351\u2013362","DOI":"10.1162\/tacl_a_00188"},{"key":"5198_CR20","doi-asserted-by":"crossref","unstructured":"Chen F, Ji R, Sun X et\u00a0al (2018) Groupcap: group-based image captioning with structured relevance and diversity constraints. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1345\u20131353","DOI":"10.1109\/CVPR.2018.00146"},{"key":"5198_CR21","doi-asserted-by":"crossref","unstructured":"Jiang W, Ma L, Jiang YG et\u00a0al (2018) Recurrent fusion network for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 499\u2013515","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"5198_CR22","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D et\u00a0al (2017) Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 375\u2013383","DOI":"10.1109\/CVPR.2017.345"},{"key":"5198_CR23","doi-asserted-by":"crossref","unstructured":"Gupta N, Jalal AS (2020) Integration of textual cues for fine-grained image captioning using deep cnn and lstm. Neural Comput Appl 32:17,899\u201317,908","DOI":"10.1007\/s00521-019-04515-z"},{"key":"5198_CR24","doi-asserted-by":"publisher","unstructured":"Lu Y, Guo C, Dai X et al (2022) Data-efficient image captioning of fine art paintings via virtual-real semantic alignment training. Neurocomputing 490:163\u2013180. https:\/\/doi.org\/10.1016\/j.neucom.2022.01.068. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S092523122200087X","DOI":"10.1016\/j.neucom.2022.01.068"},{"key":"5198_CR25","doi-asserted-by":"publisher","first-page":"265","DOI":"10.1016\/j.neucom.2022.07.068","volume":"506","author":"Y Huang","year":"2022","unstructured":"Huang Y, Chen J, Ma H et al (2022) Attribute assisted teacher-critical training strategies for image captioning. Neurocomputing 506:265\u2013276","journal-title":"Neurocomputing"},{"key":"5198_CR26","doi-asserted-by":"publisher","unstructured":"Kastner MA, Umemura K, Ide I et\u00a0al (2021) Imageability- and length-controllable image captioning. IEEE Access 9:162,951\u2013162,961. https:\/\/doi.org\/10.1109\/ACCESS.2021.3131393","DOI":"10.1109\/ACCESS.2021.3131393"},{"issue":"108","key":"5198_CR27","first-page":"285","volume":"122","author":"JH Lim","year":"2022","unstructured":"Lim JH, Chan CS, Ng KW et al (2022) Protect, show, attend and tell: Empowering image captioning models with ownership protection. 
Pattern Recog 122(108):285","journal-title":"Pattern Recog"},{"issue":"6","key":"5198_CR28","doi-asserted-by":"publisher","first-page":"2743","DOI":"10.1109\/TIP.2018.2889922","volume":"28","author":"N Yu","year":"2019","unstructured":"Yu N, Hu X, Song B et al (2019) Topic-oriented image captioning based on order-embedding. IEEE Transactions on image processing 28(6):2743\u20132754. https:\/\/doi.org\/10.1109\/TIP.2018.2889922","journal-title":"IEEE Transactions on image processing"},{"key":"5198_CR29","doi-asserted-by":"publisher","unstructured":"Xu C, Yang M, Ao X et al (2021) Retrieval-enhanced adversarial training with dynamic memory-augmented attention for image paragraph captioning. Knowl-Based Syst 214(106):730. https:\/\/doi.org\/10.1016\/j.knosys.2020.106730. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0950705120308595","DOI":"10.1016\/j.knosys.2020.106730"},{"key":"5198_CR30","unstructured":"Vaswani A, Shazeer N, Parmar N et\u00a0al (2017) Attention is all you need. Adv Neural Inform Process Syst 30"},{"key":"5198_CR31","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A et\u00a0al (2021) An image is worth 16x16 words: transformers for image recognition at scale. In: International conference on learning representations"},{"key":"5198_CR32","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G et\u00a0al (2020) End-to-end object detection with transformers. In: Computer Vision\u2013ECCV 2020: 16th European conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I 16, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"5198_CR33","doi-asserted-by":"crossref","unstructured":"Yang Q, Ni Z, Ren P (2022) Meta captioning: a meta learning based remote sensing image captioning framework. ISPRS J Photogramm Remote Sens 186:190\u2013200. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0924271622000351","DOI":"10.1016\/j.isprsjprs.2022.02.001"},{"issue":"8","key":"5198_CR34","first-page":"52","volume":"52","author":"D Zhou","year":"2022","unstructured":"Zhou D, Yang J, Bao R (2022) Collaborative strategy network for spatial attention image captioning. Appl Intell Int J Artif Intell, Neural Networks, and Complex Problem-Solving Technologies 52(8):52","journal-title":"Appl Intell Int J Artif Intell, Neural Networks, and Complex Problem-Solving Technologies"},{"issue":"11","key":"5198_CR35","doi-asserted-by":"publisher","first-page":"7348","DOI":"10.1109\/TPAMI.2021.3119754","volume":"44","author":"DJ Kim","year":"2022","unstructured":"Kim DJ, Oh TH, Choi J et al (2022) Dense relational image captioning via multi-task triple-stream networks. IEEE Transactions on pattern analysis and machine intelligence 44(11):7348\u20137362. https:\/\/doi.org\/10.1109\/TPAMI.2021.3119754","journal-title":"IEEE Transactions on pattern analysis and machine intelligence"},{"key":"5198_CR36","doi-asserted-by":"publisher","unstructured":"Tan YH, Chan CS (2019) Phrase-based image caption generator with hierarchical lstm network. Neurocomputing 333:86\u2013100. https:\/\/doi.org\/10.1016\/j.neucom.2018.12.026. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0925231218314802","DOI":"10.1016\/j.neucom.2018.12.026"},{"key":"5198_CR37","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-03624-y","volume-title":"Enhance understanding and reasoning ability for image captioning","author":"J Wei","year":"2023","unstructured":"Wei J, Li Z, Zhu J et al (2023) Enhance understanding and reasoning ability for image captioning. 
Appl Intell Int J Artif Intell, Neural Netw, Complex Problem-Solving Technol"},{"key":"5198_CR38","first-page":"5910","volume":"53","author":"Z Lian","year":"2023","unstructured":"Lian Z, Zhang YA, Li HC et al (2023) Cross modification attention-based deliberation model for image captioning. Appl Int 53:5910\u20135033","journal-title":"Appl Int"},{"key":"5198_CR39","doi-asserted-by":"publisher","first-page":"9731","DOI":"10.1007\/s10489-022-04010-4","volume":"53","author":"S Zhao","year":"2023","unstructured":"Zhao S, Li L, Peng H (2023) Incorporating retrieval-based method for feature enhanced image captioning. Appl Intell 53:9731\u20139743","journal-title":"Appl Intell"},{"key":"5198_CR40","unstructured":"Zhao S, Li L, Peng H (2023b) Multi-feature fusion enhanced transformer with multi-layer fused decoding for image captioning. Appl Intell 53:13,398\u201313,414"},{"issue":"7","key":"5198_CR41","doi-asserted-by":"publisher","first-page":"7805","DOI":"10.1109\/TITS.2021.3072970","volume":"23","author":"Y Li","year":"2021","unstructured":"Li Y, Wu C, Li L et al (2021) Caption generation from road images for traffic scene modeling. IEEE Transactions on intelligent transportation systems 23(7):7805\u20137816","journal-title":"IEEE Transactions on intelligent transportation systems"},{"key":"5198_CR42","doi-asserted-by":"crossref","unstructured":"Luo Y, Ji J, Sun X et\u00a0al (2021) Dual-level collaborative transformer for image captioning. In: Proceedings of the AAAI conference on artificial intelligence, pp 2286\u20132293","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"5198_CR43","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J et\u00a0al (2019) Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 4634\u20134643","DOI":"10.1109\/ICCV.2019.00473"},{"key":"5198_CR44","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li Y et\u00a0al (2020) X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10,971\u201310,980","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"5198_CR45","doi-asserted-by":"crossref","unstructured":"Kuo CW, Kira Z (2022) Beyond a pre-trained object detector: Cross-modal textual and visual context for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 17,969\u201317,979","DOI":"10.1109\/CVPR52688.2022.01744"},{"key":"5198_CR46","doi-asserted-by":"publisher","unstructured":"He X, Yang Y, Shi B et al (2019) Vd-san: visual-densely semantic attention network for image caption generation. Neurocomputing 328:48\u201355. https:\/\/doi.org\/10.1016\/j.neucom.2018.02.106. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0925231218309585","DOI":"10.1016\/j.neucom.2018.02.106"},{"key":"5198_CR47","doi-asserted-by":"crossref","unstructured":"Agrawal H, Desai K, Wang Y et\u00a0al (2019) Nocaps: Novel object captioning at scale. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 8948\u20138957","DOI":"10.1109\/ICCV.2019.00904"},{"issue":"7","key":"5198_CR48","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova A, Rom H, Alldrin N et al (2020) The open images dataset v4: Unified image classification, object detection, and visual relationship detection at scale. 
Int J Comput Vis 128(7):1956\u20131981","journal-title":"Int J Comput Vis"},{"key":"5198_CR49","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S et\u00a0al (2014) Microsoft coco: common objects in context. In: Computer vision\u2013ECCV 2014: 13th European conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"5198_CR50","doi-asserted-by":"crossref","unstructured":"Zhang P, Li X, Hu X et\u00a0al (2021) Vinvl: revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5579\u20135588","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"5198_CR51","doi-asserted-by":"crossref","unstructured":"Shao S, Li Z, Zhang T et\u00a0al (2019) Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 8430\u20138439","DOI":"10.1109\/ICCV.2019.00852"},{"key":"5198_CR52","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, et\u00a0al (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"key":"5198_CR53","doi-asserted-by":"crossref","unstructured":"Fan Z, Wei Z, Wang S et\u00a0al (2021) Tcic: theme concepts learning cross language and vision for image captioning. In: International joint conference on artificial intelligence","DOI":"10.24963\/ijcai.2021\/91"},{"key":"5198_CR54","doi-asserted-by":"crossref","unstructured":"Lu J, Yang J, Batra D et\u00a0al (2018) Neural baby talk. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7219\u20137228","DOI":"10.1109\/CVPR.2018.00754"},{"key":"5198_CR55","unstructured":"Ren S, He K, Girshick R et\u00a0al (2015) Faster r-cnn: towards real-time object detection with region proposal networks. 
Adv Neural Inform Process Syst 28"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-023-05198-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-023-05198-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-023-05198-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,13]],"date-time":"2024-01-13T10:14:30Z","timestamp":1705140870000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-023-05198-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,7]]},"references-count":55,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,1]]}},"alternative-id":["5198"],"URL":"https:\/\/doi.org\/10.1007\/s10489-023-05198-9","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12,7]]},"assertion":[{"value":"24 November 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 December 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"We promise that this manuscript will not be submitted to multiple journals or conferences simultaneously. We promise to abide by ethical standards, respect the autonomy of participants in the use of data, and ensure the legal, transparent, and secure use of data.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical standard"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}