{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T15:47:21Z","timestamp":1778168841876,"version":"3.51.4"},"reference-count":60,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["52102382"],"award-info":[{"award-number":["52102382"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007839","name":"Yunnan University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100007839","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Engineering Applications of Artificial Intelligence"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.engappai.2026.114518","type":"journal-article","created":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T13:24:19Z","timestamp":1774013059000},"page":"114518","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Retrieval-based objects and relations prompt for image captioning"],"prefix":"10.1016","volume":"174","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7669-8201","authenticated-orcid":false,"given":"Jinjing","family":"Gu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0127-6374","authenticated-orcid":false,"given":"Tianbao","family":"Qin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5137-0488","authenticated-orcid":false,"given":"Yuanyuan","family":"Pu","sequence":"additional","affiliation":[]},{"given":"Zhengpeng","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.engappai.2026.114518_b1","doi-asserted-by":"crossref","unstructured":"Agrawal, H., Desai, K., Wang, Y., Chen, X., Jain, R., Johnson, M., Batra, D., Parikh, D., Lee, S., Anderson, P., 2019. Nocaps: Novel object captioning at scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 8948\u20138957.","DOI":"10.1109\/ICCV.2019.00904"},{"key":"10.1016\/j.engappai.2026.114518_b2","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2023.107732","article-title":"NPoSC-A3: A novel part of speech clues-aware adaptive attention mechanism for image captioning","volume":"131","author":"Al-Qatf","year":"2024","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114518_b3","doi-asserted-by":"crossref","first-page":"23716","DOI":"10.52202\/068431-1723","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114518_b4","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, the Netherlands, October 11-14, 2016, Proceedings, Part V 14","first-page":"382","article-title":"Spice: Semantic propositional image caption evaluation","author":"Anderson","year":"2016"},{"key":"10.1016\/j.engappai.2026.114518_b5","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L., 2018. Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 6077\u20136086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"10.1016\/j.engappai.2026.114518_b6","series-title":"Introducing English Grammar","author":"B\u00f6rjars","year":"2019"},{"key":"10.1016\/j.engappai.2026.114518_b7","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.111433","article-title":"Top-down framework for weakly-supervised grounded image captioning","volume":"287","author":"Cai","year":"2024","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.engappai.2026.114518_b8","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2021.104340","article-title":"Improving image captioning with pyramid attention and SC-GAN","volume":"117","author":"Chen","year":"2022","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.engappai.2026.114518_b9","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R., 2020. Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10578\u201310587.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"10.1016\/j.engappai.2026.114518_b10","first-page":"1317","article-title":"Memory-augmented image captioning","volume":"vol. 35, no. 2","author":"Fei","year":"2021"},{"key":"10.1016\/j.engappai.2026.114518_b11","doi-asserted-by":"crossref","unstructured":"Feng, Y., Ma, L., Liu, W., Luo, J., 2019. Unsupervised image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 4125\u20134134.","DOI":"10.1109\/CVPR.2019.00425"},{"issue":"3","key":"10.1016\/j.engappai.2026.114518_b12","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3617592","article-title":"Deep learning approaches on image captioning: A review","volume":"56","author":"Ghandi","year":"2023","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.engappai.2026.114518_b13","series-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Gu","year":"2023"},{"issue":"10","key":"10.1016\/j.engappai.2026.114518_b14","doi-asserted-by":"crossref","first-page":"1697","DOI":"10.1109\/JAS.2021.1004201","article-title":"Variational gridded graph convolution network for node classification","volume":"8","author":"Hong","year":"2021","journal-title":"IEEE\/CAA J. Autom. Sin."},{"key":"10.1016\/j.engappai.2026.114518_b15","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2024.109134","article-title":"Attribute-driven filtering: A new attributes predicting approach for fine-grained image captioning","volume":"137","author":"Hossen","year":"2024","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114518_b16","series-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1945","article-title":"Improve image captioning via relation modeling","author":"Huang","year":"2022"},{"key":"10.1016\/j.engappai.2026.114518_b17","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.-Y., 2019. Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 4634\u20134643.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"10.1016\/j.engappai.2026.114518_b18","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L., 2015. Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 3128\u20133137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"10.1016\/j.engappai.2026.114518_b19","first-page":"4320","article-title":"Vipcap: Retrieval text-based visual prompts for lightweight image captioning","volume":"vol. 39, no. 4","author":"Kim","year":"2025"},{"key":"10.1016\/j.engappai.2026.114518_b20","series-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"10.1016\/j.engappai.2026.114518_b21","series-title":"Semi-supervised classification with graph convolutional networks","author":"Kipf","year":"2016"},{"key":"10.1016\/j.engappai.2026.114518_b22","doi-asserted-by":"crossref","first-page":"351","DOI":"10.1162\/tacl_a_00188","article-title":"Treetalk: Composition and compression of trees for image descriptions","volume":"2","author":"Kuznetsova","year":"2014","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10.1016\/j.engappai.2026.114518_b23","doi-asserted-by":"crossref","unstructured":"Li, Y., Pan, Y., Yao, T., Mei, T., 2022. Comprehending and ordering semantics for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 17990\u201317999.","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"10.1016\/j.engappai.2026.114518_b24","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16","first-page":"121","article-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks","author":"Li","year":"2020"},{"issue":"7","key":"10.1016\/j.engappai.2026.114518_b25","doi-asserted-by":"crossref","first-page":"5266","DOI":"10.1109\/TCSVT.2023.3343520","article-title":"Cascade semantic prompt alignment network for image captioning","volume":"34","author":"Li","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114518_b26","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.engappai.2026.114518_b27","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"issue":"6","key":"10.1016\/j.engappai.2026.114518_b28","doi-asserted-by":"crossref","first-page":"3685","DOI":"10.1109\/TCSVT.2021.3107035","article-title":"Region-aware image captioning via interaction learning","volume":"32","author":"Liu","year":"2021","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114518_b29","series-title":"Nltk: The natural language toolkit","author":"Loper","year":"2002"},{"key":"10.1016\/j.engappai.2026.114518_b30","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"I-tuning: Tuning frozen language models with image for lightweight image captioning","author":"Luo","year":"2023"},{"key":"10.1016\/j.engappai.2026.114518_b31","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109420","article-title":"Towards local visual modeling for image captioning","volume":"138","author":"Ma","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.engappai.2026.114518_b32","series-title":"Mapl: Parameter-efficient adaptation of unimodal pre-trained models for vision-language few-shot prompting","author":"Ma\u00f1as","year":"2022"},{"key":"10.1016\/j.engappai.2026.114518_b33","series-title":"Clipcap: Clip prefix for image captioning","author":"Mokady","year":"2021"},{"key":"10.1016\/j.engappai.2026.114518_b34","series-title":"European Conference on Computer Vision","first-page":"167","article-title":"Grit: Faster and better image captioning transformer using dual visual features","author":"Nguyen","year":"2022"},{"key":"10.1016\/j.engappai.2026.114518_b35","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, H., Mei, T., 2017. Video captioning with transferred semantic attributes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 6504\u20136512.","DOI":"10.1109\/CVPR.2017.111"},{"key":"10.1016\/j.engappai.2026.114518_b36","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T., 2020. X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10971\u201310980.","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"10.1016\/j.engappai.2026.114518_b37","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J., 2002. Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics. pp. 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"10.1016\/j.engappai.2026.114518_b38","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2023.106545","article-title":"Image captioning using transformer-based double attention network","volume":"125","author":"Parvin","year":"2023","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114518_b39","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"issue":"8","key":"10.1016\/j.engappai.2026.114518_b40","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"10.1016\/j.engappai.2026.114518_b41","series-title":"Retrieval-augmented image captioning","author":"Ramos","year":"2023"},{"key":"10.1016\/j.engappai.2026.114518_b42","doi-asserted-by":"crossref","unstructured":"Ramos, R., Martins, B., Elliott, D., Kementchedjhieva, Y., 2023b. Smallcap: lightweight image captioning prompted with retrieval augmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2840\u20132849.","DOI":"10.1109\/CVPR52729.2023.00278"},{"key":"10.1016\/j.engappai.2026.114518_b43","doi-asserted-by":"crossref","DOI":"10.1016\/j.displa.2024.102653","article-title":"Reversegan: An intelligent reverse generative adversarial networks system for complex image captioning generation","volume":"82","author":"Tong","year":"2024","journal-title":"Displays"},{"key":"10.1016\/j.engappai.2026.114518_b44","series-title":"Sg2caps: Revisiting scene graphs for image captioning","author":"Tripathi","year":"2021"},{"key":"10.1016\/j.engappai.2026.114518_b45","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114518_b46","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D., 2015a. Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 4566\u20134575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10.1016\/j.engappai.2026.114518_b47","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C., Parikh, D., 2015b. METEOR: An automatic metric for MT evaluation with high levels of correlation with human judgments. In: Proc. IEEE Comput. Soc. Conf. Comput. Vis. Pattern Recognit.. pp. 4566\u20134575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10.1016\/j.engappai.2026.114518_b48","doi-asserted-by":"crossref","unstructured":"Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al., 2020. Transformers: State-of-the-art natural language processing. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. pp. 38\u201345.","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"10.1016\/j.engappai.2026.114518_b49","doi-asserted-by":"crossref","DOI":"10.1016\/j.compeleceng.2024.109626","article-title":"Sentinel mechanism for visual semantic graph-based image captioning","volume":"119","author":"Xiao","year":"2024","journal-title":"Comput. Electr. Eng."},{"issue":"4","key":"10.1016\/j.engappai.2026.114518_b50","doi-asserted-by":"crossref","first-page":"1445","DOI":"10.1109\/TPAMI.2020.2975798","article-title":"Deep multi-view enhancement hashing for image retrieval","volume":"43","author":"Yan","year":"2020","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2026.114518_b51","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.128666","article-title":"EntroCap: Zero-shot image captioning with entropy-based retrieval","volume":"611","author":"Yan","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.engappai.2026.114518_b52","doi-asserted-by":"crossref","unstructured":"Yang, X., Tang, K., Zhang, H., Cai, J., 2019. Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10685\u201310694.","DOI":"10.1109\/CVPR.2019.01094"},{"key":"10.1016\/j.engappai.2026.114518_b53","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., Mei, T., 2018. Exploring visual relationship for image captioning. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 684\u2013699.","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"10.1016\/j.engappai.2026.114518_b54","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., Qiu, Z., Mei, T., 2017. Boosting image captioning with attributes. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 4894\u20134902.","DOI":"10.1109\/ICCV.2017.524"},{"issue":"12","key":"10.1016\/j.engappai.2026.114518_b55","doi-asserted-by":"crossref","first-page":"4467","DOI":"10.1109\/TCSVT.2019.2947482","article-title":"Multimodal transformer with multi-view visual representation for image captioning","volume":"30","author":"Yu","year":"2019","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114518_b56","first-page":"3394","article-title":"Consensus graph representation learning for better grounded image captioning","volume":"vol. 35, no. 4","author":"Zhang","year":"2021"},{"key":"10.1016\/j.engappai.2026.114518_b57","doi-asserted-by":"crossref","DOI":"10.1016\/j.displa.2022.102210","article-title":"Aligned visual semantic scene graph for image captioning","volume":"74","author":"Zhao","year":"2022","journal-title":"Displays"},{"key":"10.1016\/j.engappai.2026.114518_b58","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L.H., Zhou, L., Dai, X., Yuan, L., Li, Y., et al., 2022. Regionclip: Region-based language-image pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 16793\u201316803.","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"10.1016\/j.engappai.2026.114518_b59","first-page":"13041","article-title":"Unified vision-language pre-training for image captioning and vqa","volume":"vol. 34, no. 07","author":"Zhou","year":"2020"},{"key":"10.1016\/j.engappai.2026.114518_b60","series-title":"Deformable detr: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2020"}],"container-title":["Engineering Applications of Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626007992?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626007992?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T20:45:56Z","timestamp":1776113156000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0952197626007992"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":60,"alternative-id":["S0952197626007992"],"URL":"https:\/\/doi.org\/10.1016\/j.engappai.2026.114518","relation":{},"ISSN":["0952-1976"],"issn-type":[{"value":"0952-1976","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Retrieval-based objects and relations prompt for image captioning","name":"articletitle","label":"Article Title"},{"value":"Engineering Applications of Artificial Intelligence","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.engappai.2026.114518","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114518"}}