{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T04:03:10Z","timestamp":1771646590096,"version":"3.50.1"},"reference-count":70,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100002611","name":"University of Seoul","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002611","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["RS-2025-24523036"],"award-info":[{"award-number":["RS-2025-24523036"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.neucom.2026.133025","type":"journal-article","created":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T00:27:22Z","timestamp":1770856042000},"page":"133025","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["ReCoD: Enhancing image description for cross-modal understanding via retrieval and comparison feedback mechanism"],"prefix":"10.1016","volume":"676","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4337-4706","authenticated-orcid":false,"given":"Geunyoung","family":"Jung","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2795-1896","authenticated-orcid":false,"given":"Jun","family":"Park","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8201-1320","authenticated-orcid":false,"given":"Hankyeol","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Kyungwoo","family":"Song","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9316-9750","authenticated-orcid":false,"given":"Jiyoung","family":"Jung","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2026.133025_bib0005","series-title":"Proceedings of the 38th International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neucom.2026.133025_bib0010","series-title":"Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.neucom.2026.133025_bib0015","author":"Touvron"},{"key":"10.1016\/j.neucom.2026.133025_bib0020","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"10.1016\/j.neucom.2026.133025_bib0025","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"13492","article-title":"De-diffusion makes text a strong cross-modal interface","author":"Wei","year":"2024"},{"key":"10.1016\/j.neucom.2026.133025_bib0030","author":"Mokady"},{"key":"10.1016\/j.neucom.2026.133025_bib0035","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"I-tuning: tuning frozen language models with image for lightweight image captioning","author":"Luo","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0040","author":"Chung"},{"key":"10.1016\/j.neucom.2026.133025_bib0055","series-title":"Advances in Neural Information Processing Systems","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","author":"Tsimpoukelli","year":"2021"},{"key":"10.1016\/j.neucom.2026.133025_bib0060","series-title":"Advances in Neural Information Processing Systems","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","author":"Alayrac","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0065","series-title":"Advances in Neural Information Processing Systems","first-page":"49250","article-title":"InstructBLIP: towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0070","series-title":"Advances in Neural Information Processing Systems","first-page":"72096","article-title":"Language is not all you need: aligning perception with language models","author":"Huang","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0075","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"26296","article-title":"Improved baselines with visual instruction tuning","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2026.133025_bib0080","series-title":"Conference on Empirical Methods in Natural Language Processing","first-page":"292","article-title":"Evaluating object hallucination in large vision-language models","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0085","series-title":"Proceedings of the 38th International Conference on Machine Learning","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.neucom.2026.133025_bib0090","series-title":"European Conference on Computer Vision (ECCV)","first-page":"529","article-title":"SLIP: self-supervision meets language-image pre-training","author":"Mu","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0095","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"27391","article-title":"DetCLIPv3: towards versatile generative open-vocabulary object detection","author":"Yao","year":"2024"},{"key":"10.1016\/j.neucom.2026.133025_bib0100","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"16901","article-title":"YOLO-world: real-time open-vocabulary object detection","author":"Cheng","year":"2024"},{"key":"10.1016\/j.neucom.2026.133025_bib0105","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"14987","article-title":"LLMDet: learning strong open-vocabulary object detectors under the supervision of large language models","author":"Fu","year":"2025"},{"key":"10.1016\/j.neucom.2026.133025_bib0110","first-page":"1","article-title":"SKDF: a simple knowledge distillation framework for distilling open-vocabulary knowledge to open-world object detector","author":"Ma","year":"2025","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.133025_bib0115","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.neucom.2026.133025_bib0120","series-title":"International Conference on Learning Representations","article-title":"Deformable DETR: deformable transformers for end-to-end object detection","author":"Zhu","year":"2021"},{"key":"10.1016\/j.neucom.2026.133025_bib0125","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"7464","article-title":"YOLOv7: trainable bag-of-freebies sets new state-of-the-art for real-time object detectors","author":"Wang","year":"2023"},{"issue":"11","key":"10.1016\/j.neucom.2026.133025_bib0130","doi-asserted-by":"crossref","first-page":"6921","DOI":"10.1109\/TCYB.2024.3424430","article-title":"Multilevel fine-grained features-based general framework for object detection","volume":"54","author":"Zuo","year":"2024","journal-title":"IEEE Trans. Cybern."},{"key":"10.1016\/j.neucom.2026.133025_bib0135","series-title":"International Conference on Learning Representations","article-title":"Language-driven semantic segmentation","author":"Li","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0140","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"18082","article-title":"DenseCLIP: language-guided dense prediction with context-aware prompting","author":"Rao","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0145","series-title":"European Conference on Computer Vision (ECCV)","first-page":"696","article-title":"Extract free dense labels from CLIP","author":"Zhou","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0150","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"17980","article-title":"Scaling up vision-language pre-training for image captioning","author":"Hu","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0155","article-title":"GIT: a generative image-to-text transformer for vision and language","author":"Wang","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"10.1016\/j.neucom.2026.133025_bib0160","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"6935","article-title":"Cross-domain image captioning with discriminative finetuning","author":"Dess\u00ec","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0165","series-title":"The Eleventh International Conference on Learning Representations","article-title":"DeCap: decoding CLIP latents for zero-shot captioning via text-only training","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0170","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"2840","article-title":"SmallCap: lightweight image captioning prompted with retrieval augmentation","author":"Ramos","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0175","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"14100","article-title":"MeaCap: memory-augmented zero-shot image captioning","author":"Zeng","year":"2024"},{"key":"10.1016\/j.neucom.2026.133025_bib0180","series-title":"Proceedings of the 40th the International Conference on Machine Learning","first-page":"19730","article-title":"BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0185","series-title":"Advances in Neural Information Processing Systems","first-page":"34892","article-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0190","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10684","article-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0195","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1145\/2629489","article-title":"Wikidata: a free collaborative knowledgebase","author":"Vrande\u010di\u0107","year":"2014","journal-title":"Commun. ACM"},{"key":"10.1016\/j.neucom.2026.133025_bib0200","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"4444","article-title":"ConceptNet 5.5: an open multilingual graph of general knowledge","author":"Speer","year":"2017"},{"key":"10.1016\/j.neucom.2026.133025_bib0205","first-page":"3081","article-title":"An empirical study of GPT-3 for few-shot knowledge-based VQA","author":"Yang","year":"2022","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.neucom.2026.133025_bib0210","series-title":"Findings of the Association for Computational Linguistics: ACL 2023","first-page":"9268","article-title":"Zero-shot visual question answering with language model feedback","author":"Du","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0215","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10867","article-title":"From images to textual prompts: zero-shot visual question answering with frozen large language models","author":"Guo","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0220","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"2963","article-title":"PromptCap: prompt-guided image captioning for VQA with GPT-3","author":"Hu","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0225","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"14974","article-title":"Prompting large language models with answer heuristics for knowledge-based visual question answering","author":"Shao","year":"2023"},{"key":"10.1016\/j.neucom.2026.133025_bib0230","series-title":"European Conference on Computer Vision (ECCV)","first-page":"740","article-title":"Microsoft COCO: common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.neucom.2026.133025_bib0235","series-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"2556","article-title":"Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning","author":"Sharma","year":"2018"},{"key":"10.1016\/j.neucom.2026.133025_bib0240","author":"Burns"},{"key":"10.1016\/j.neucom.2026.133025_bib0245","series-title":"Advances in Neural Information Processing Systems","first-page":"1877","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"10.1016\/j.neucom.2026.133025_bib0250","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3195","article-title":"Ok-VQA: a visual question answering benchmark requiring external knowledge","author":"Marino","year":"2019"},{"key":"10.1016\/j.neucom.2026.133025_bib0255","series-title":"European Conference on Computer Vision (ECCV)","first-page":"146","article-title":"A-OKVQA: a benchmark for visual question answering using world knowledge","author":"Schwenk","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0260","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"6904","article-title":"Making the V in VQA matter: elevating the role of image understanding in visual question answering","author":"Goyal","year":"2017"},{"key":"10.1016\/j.neucom.2026.133025_bib0265","series-title":"The claude 3 model family: opus, sonnet, haiku","year":"2024"},{"key":"10.1016\/j.neucom.2026.133025_bib0270","series-title":"Conference on Empirical Methods in Natural Language Processing","first-page":"489","article-title":"ConceptBert: concept-aware representation for visual question answering","author":"Gard\u00e8res","year":"2020"},{"key":"10.1016\/j.neucom.2026.133025_bib0275","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"14111","article-title":"KRISP: integrating implicit and symbolic knowledge for open-domain knowledge-based VQA","author":"Marino","year":"2021"},{"key":"10.1016\/j.neucom.2026.133025_bib0280","first-page":"2712","article-title":"Multi-modal answer validation for knowledge-based VQA","author":"Wu","year":"2022","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.neucom.2026.133025_bib0285","series-title":"Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","first-page":"956","article-title":"KAT: a knowledge augmented transformer for vision-and-language","author":"Gui","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0290","series-title":"Advances in Neural Information Processing Systems","first-page":"10560","article-title":"REVIVE: regional visual representation matters in knowledge-based visual question answering","author":"Lin","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0295","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2022","first-page":"951","article-title":"Plug-and-play VQA: zero-shot VQA by conjoining large pretrained models with zero training","author":"Tiong","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0300","series-title":"Advances in Neural Information Processing Systems","first-page":"13","article-title":"ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"Lu","year":"2019"},{"key":"10.1016\/j.neucom.2026.133025_bib0305","series-title":"Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)","first-page":"5100","article-title":"LXMERT: learning cross-modality encoder representations from transformers","author":"Tan","year":"2019"},{"key":"10.1016\/j.neucom.2026.133025_bib0310","series-title":"European Conference on Computer Vision (ECCV)","first-page":"662","article-title":"Webly supervised concept expansion for general purpose vision models","author":"Kamath","year":"2022"},{"key":"10.1016\/j.neucom.2026.133025_bib0315","author":"Wang"},{"key":"10.1016\/j.neucom.2026.133025_bib0320","series-title":"European Conference on Computer Vision (ECCV)","first-page":"323","article-title":"LLaMA-VID: an image is worth 2 tokens in large language models","author":"Li","year":"2024"},{"key":"10.1016\/j.neucom.2026.133025_bib0325","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR)","first-page":"19792","article-title":"VisionZip: longer is better but not necessary in vision language models","author":"Yang","year":"2025"},{"key":"10.1016\/j.neucom.2026.133025_bib0330","series-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","first-page":"311","article-title":"BLEU: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.neucom.2026.133025_bib0335","series-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization","first-page":"65","article-title":"METEOR: an automatic metric for MT evaluation with improved correlation with human judgments","author":"Banerjee","year":"2005"},{"key":"10.1016\/j.neucom.2026.133025_bib0340","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"ROUGE: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.neucom.2026.133025_bib0345","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4566","article-title":"CIDEr: consensus-based image description evaluation","author":"Vedantam","year":"2015"},{"key":"10.1016\/j.neucom.2026.133025_bib0350","series-title":"European Conference on Computer Vision (ECCV)","first-page":"382","article-title":"SPICE: semantic propositional image caption evaluation","author":"Anderson","year":"2016"},{"key":"10.1016\/j.neucom.2026.133025_bib0355","author":"Hurst"},{"key":"10.1016\/j.neucom.2026.133025_bib0360","series-title":"Advances in Neural Information Processing Systems","first-page":"17612","article-title":"Mind the gap: understanding the modality gap in multi-modal contrastive representation learning","author":"Liang","year":"2022"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226004224?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226004224?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T03:30:13Z","timestamp":1771644613000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231226004224"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":70,"alternative-id":["S0925231226004224"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133025","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"ReCoD: Enhancing image description for cross-modal understanding via retrieval and comparison feedback mechanism","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133025","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"133025"}}