{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T15:42:15Z","timestamp":1776181335597,"version":"3.50.1"},"reference-count":54,"publisher":"Elsevier BV","issue":"4","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100008845","name":"Xinjiang University","doi-asserted-by":"publisher","award":["XJDX2025YJS189"],"award-info":[{"award-number":["XJDX2025YJS189"]}],"id":[{"id":"10.13039\/501100008845","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62166043"],"award-info":[{"award-number":["62166043"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Processing &amp; Management"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.ipm.2025.104606","type":"journal-article","created":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T02:59:13Z","timestamp":1768013953000},"page":"104606","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"title":["PrQAC : Prompting LLaMA3 with question-aware image captions and answer candidates for knowledge-based VQA"],"prefix":"10.1016","volume":"63","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7817-1290","authenticated-orcid":false,"given":"Peichao","family":"Jiang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8766-0647","authenticated-orcid":false,"given":"Mayire","family":"Ibrayim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4085-7651","authenticated-orcid":false,"given":"Linying","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0465-3117","authenticated-orcid":false,"given":"Wenjie","family":"Xu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.ipm.2025.104606_bib0001","series-title":"2018 IEEE\/CVF conference on computer vision and pattern recognition","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018"},{"key":"10.1016\/j.ipm.2025.104606_bib0002","series-title":"2015 IEEE International conference on computer vision, ICCV 2015, Santiago, Chile, December 7-13, 2015","first-page":"2425","article-title":"VQA: Visual question answering","author":"Antol","year":"2015"},{"key":"10.1016\/j.ipm.2025.104606_bib0003","unstructured":"Yang, Q. A., Yang, B., Zhang, B., Hui, B., Zheng, B., Yu, B., Li, C., Liu, D., Huang, F., Dong, G., Wei, H., Lin, H., Yang, J., Tu, J., Zhang, J., Yang, J., Yang, J., Zhou, J., Lin, J., Wang, Z. (2024). Qwen2.5 Technical Report. ArXiv, abs\/2412.15115, https:\/\/api.semanticscholar.org\/CorpusID:274859421."},{"key":"10.1016\/j.ipm.2025.104606_bib0004","article-title":"Language models are few-shot learners","volume":"abs\/2005.14165","author":"Brown","year":"2020","journal-title":"ArXiv"},{"key":"10.1016\/j.ipm.2025.104606_bib0005","series-title":"Computer vision - ECCV 2020 - 16th European conference, Glasgow, UK, August 23-28, 2020, proceedings, part XXX","first-page":"104","article-title":"UNITER: Universal image-text representation learning","volume":"vol. 12375","author":"Chen","year":"2020"},{"key":"10.1016\/j.ipm.2025.104606_bib0006","series-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.ipm.2025.104606_bib0007","series-title":"2022 IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"5079","article-title":"MuKEA: Multimodal knowledge extraction and accumulation for knowledge-based visual question answering","author":"Ding","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0008","series-title":"2022 IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"5057","article-title":"Transform-retrieve-generate: Natural language-centric outside-knowledge visual question answering","author":"Gao","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0009","series-title":"Findings of the association for computational linguistics: EMNLP 2020, online event, 16-20 November 2020","first-page":"489","article-title":"ConceptBERT: Concept-aware representation for visual question answering","volume":"vol. EMNLP 2020","author":"Gard\u00e8res","year":"2020"},{"key":"10.1016\/j.ipm.2025.104606_bib0010","series-title":"2017 IEEE conference on computer vision and pattern recognition (CVPR)","first-page":"6325","article-title":"Making the V in VQA matter: Elevating the role of image understanding in visual question answering","author":"Goyal","year":"2017"},{"key":"10.1016\/j.ipm.2025.104606_bib0011","unstructured":"Grattafiori, A., Dubey, A., Jauhri, A., Pandey, A., Kadian, A., Al-Dahle, A., Letman, A., Mathur, A., Schelten, A., Vaughan, A. et al. (2024). The Llama 3 herd of models. arXiv preprint arXiv: 2407.21783,."},{"key":"10.1016\/j.ipm.2025.104606_bib0012","series-title":"Proceedings of the 2022 conference of the North American chapter of the association for computational linguistics: human language technologies","first-page":"956","article-title":"KAT: A knowledge augmented transformer for vision-and-language","author":"Gui","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0013","series-title":"2017 IEEE International conference on computer vision (ICCV)","first-page":"804","article-title":"Learning to reason: End-to-end module networks for visual question answering","author":"Hu","year":"2017"},{"key":"10.1016\/j.ipm.2025.104606_bib0014","series-title":"2023 IEEE\/CVF International conference on computer vision (ICCV)","first-page":"2951","article-title":"PromptCAP: Prompt-guided image captioning for VQA with GPT-3","author":"Hu","year":"2023"},{"key":"10.1016\/j.ipm.2025.104606_bib0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110399","article-title":"Prompting large language model with context and pre-answer for knowledge-based VQA","volume":"151","author":"Hu","year":"2024","journal-title":"Pattern Recognition"},{"issue":"3","key":"10.1016\/j.ipm.2025.104606_bib0016","doi-asserted-by":"crossref","first-page":"843","DOI":"10.26599\/BDMA.2024.9020026","article-title":"Prompting large language models with knowledge-injection for knowledge-based visual question answering","volume":"7","author":"Hu","year":"2024","journal-title":"Big Data Mining and Analytics"},{"key":"10.1016\/j.ipm.2025.104606_bib0017","series-title":"2019 IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"6693","article-title":"GQA: A new dataset for real-world visual reasoning and compositional question answering","author":"Hudson","year":"2019"},{"key":"10.1016\/j.ipm.2025.104606_bib0018","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2024.102270","article-title":"From image to language: A critical analysis of visual question answering (VQA) approaches, challenges, and opportunities","volume":"106","author":"Ishmam","year":"2024","journal-title":"Information Fusion"},{"key":"10.1016\/j.ipm.2025.104606_bib0019","author":"Jiang"},{"key":"10.1016\/j.ipm.2025.104606_bib0020","series-title":"European conference on computer vision","first-page":"662","article-title":"Webly supervised concept expansion for general purpose vision models","author":"Kamath","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0021","series-title":"Computer vision - ECCV 2024 - 18th European conference, Milan, Italy, September 29-October 4, 2024, proceedings, part XX","first-page":"132","article-title":"HYDRA: A hyper agent for dynamic compositional visual reasoning","volume":"vol. 15078","author":"Ke","year":"2024"},{"key":"10.1016\/j.ipm.2025.104606_bib0022","series-title":"Advances in neural information processing systems 31: Annual conference on neural information processing systems 2018, neurIPS 2018, December 3-8, 2018, Montr\u00e9al, Canada","first-page":"1571","article-title":"Bilinear attention networks","author":"Kim","year":"2018"},{"key":"10.1016\/j.ipm.2025.104606_bib0023","series-title":"International conference on machine learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume":"vol. 202","author":"Li","year":"2023"},{"key":"10.1016\/j.ipm.2025.104606_bib0024","series-title":"International conference on machine learning, ICML 2022, 17-23 July 2022, Baltimore, Maryland, USA","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume":"vol. 162","author":"Li","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0025","series-title":"2019 IEEE\/CVF International conference on computer vision, ICCV 2019, Seoul, Korea (South), October 27-November 2, 2019","first-page":"10312","article-title":"Relation-aware graph attention network for visual question answering","author":"Li","year":"2019"},{"key":"10.1016\/j.ipm.2025.104606_bib0026","first-page":"10560","article-title":"Revive: Regional visual representation matters in knowledge-based visual question answering","volume":"35","author":"Lin","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2025.104606_bib0027","series-title":"2024 IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"26286","article-title":"Improved baselines with visual instruction tuning","author":"Liu","year":"2024"},{"issue":"4","key":"10.1016\/j.ipm.2025.104606_bib0028","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1023\/B:BTTJ.0000047600.45421.6d","article-title":"ConceptNet \u2013 A practical commonsense reasoning tool-kit","volume":"22","author":"Liu","year":"2004","journal-title":"BT Technology Journal"},{"issue":"5","key":"10.1016\/j.ipm.2025.104606_bib0029","doi-asserted-by":"crossref","DOI":"10.1016\/j.ipm.2024.103809","article-title":"Are LLMs good at structured outputs? A benchmark for evaluating structured output capabilities in LLMs","volume":"61","author":"Liu","year":"2024","journal-title":"Information Processing & Management"},{"key":"10.1016\/j.ipm.2025.104606_bib0030","unstructured":"Lu, J., Batra, D., Parikh, D., & Lee, S. (2019). ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Proceedings of the 33rd International Conference on Neural Information Processing Systems, 32. 11."},{"key":"10.1016\/j.ipm.2025.104606_bib0031","series-title":"Proceedings of the 2021 conference on empirical methods in natural language processing, EMNLP 2021, virtual event \/ Punta Cana, Dominican Republic, 7-11 November, 2021","first-page":"6417","article-title":"Weakly-supervised visual-retriever-reader for knowledge-based question answering","author":"Luo","year":"2021"},{"key":"10.1016\/j.ipm.2025.104606_bib0032","series-title":"IEEE conference on computer vision and pattern recognition, CVPR 2021, virtual, June 19-25, 2021","first-page":"14111","article-title":"KRISP: Integrating implicit and symbolic knowledge for open-domain knowledge-based VQA","author":"Marino","year":"2021"},{"key":"10.1016\/j.ipm.2025.104606_bib0033","series-title":"IEEE conference on computer vision and pattern recognition, CVPR 2019, Long Beach, CA, USA, June 16-20, 2019","first-page":"3195","article-title":"OK-VQA: A visual question answering benchmark requiring external knowledge","author":"Marino","year":"2019"},{"key":"10.1016\/j.ipm.2025.104606_bib0034","article-title":"ClipCap: CLIP prefix for image captioning","volume":"abs\/2111.09734","author":"Mokady","year":"2021","journal-title":"ArXiv"},{"issue":"4","key":"10.1016\/j.ipm.2025.104606_bib0035","doi-asserted-by":"crossref","DOI":"10.1016\/j.ipm.2024.103726","article-title":"Explainable knowledge reasoning via thought chains for knowledge-based visual question answering","volume":"61","author":"Qiu","year":"2024","journal-title":"Information Processing & Management"},{"key":"10.1016\/j.ipm.2025.104606_bib0036","series-title":"International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.ipm.2025.104606_bib0037","series-title":"2023 IEEE\/CVF winter conference on applications of computer vision (WACV)","first-page":"1155","article-title":"VLC-BERT: Visual question answering with contextualized commonsense knowledge","author":"Ravi","year":"2023"},{"key":"10.1016\/j.ipm.2025.104606_bib0038","series-title":"Computer vision - ECCV 2022 - 17th European conference, Tel Aviv, Israel, October 23-27, 2022, proceedings, part VIII","first-page":"146","article-title":"A-OKVQA: A benchmark for visual question answering using world knowledge","volume":"vol. 13668","author":"Schwenk","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0039","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"8876","article-title":"KVQA: Knowledge-aware visual question answering","volume":"vol. 33","author":"Shah","year":"2019"},{"key":"10.1016\/j.ipm.2025.104606_bib0040","series-title":"2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"14974","article-title":"Prompting large language models with answer heuristics for knowledge-based visual question answering","author":"Shao","year":"2023"},{"key":"10.1016\/j.ipm.2025.104606_bib0041","series-title":"The tenth international conference on learning representations, ICLR 2022, virtual event, April 25-29, 2022","article-title":"How much can CLIP benefit vision-and-language tasks?","author":"Shen","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0042","series-title":"Findings of the association for computational linguistics: NAACL 2024","first-page":"1836","article-title":"Prompt space optimizing few-shot reasoning success with large language models","author":"Shi","year":"2024"},{"key":"10.1016\/j.ipm.2025.104606_bib0043","series-title":"Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing, EMNLP-IJCNLP 2019, Hong Kong, China, November 3-7, 2019","first-page":"5099","article-title":"LXMERT: Learning cross-modality encoder representations from transformers","author":"Tan","year":"2019"},{"issue":"10","key":"10.1016\/j.ipm.2025.104606_bib0044","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1145\/2629489","article-title":"Wikidata: A free collaborative knowledgebase","volume":"57","author":"Vrande\u010di\u0107","year":"2014","journal-title":"Communications of the ACM"},{"key":"10.1016\/j.ipm.2025.104606_bib0045","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113711","article-title":"Toward profundity and precision: Reinventing knowledge retrieval capabilities guided by human cognition","volume":"322","author":"Wang","year":"2025","journal-title":"Knowledge-Based Systems"},{"key":"10.1016\/j.ipm.2025.104606_bib0046","series-title":"Proceedings of the twenty-sixth international joint conference on artificial intelligence, IJCAI 2017, Melbourne, Australia, August 19-25, 2017","first-page":"1290","article-title":"Explicit knowledge-based reasoning for visual question answering","author":"Wang","year":"2017"},{"key":"10.1016\/j.ipm.2025.104606_bib0047","series-title":"Thirty-sixth AAAI conference on artificial intelligence, AAAI 2022","first-page":"2712","article-title":"Multi-modal answer validation for knowledge-based VQA","author":"Wu","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0048","unstructured":"Xie, S. M., Raghunathan, A., Liang, P., & Ma, T. (2021). An explanation of in-context learning as implicit bayesian inference. arXiv preprint arXiv: 2111.02080."},{"key":"10.1016\/j.ipm.2025.104606_bib0049","first-page":"3081","article-title":"An empirical study of GPT-3 for few-shot knowledge-based VQA","author":"Yang","year":"2022"},{"key":"10.1016\/j.ipm.2025.104606_bib0050","series-title":"2019 IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"6274","article-title":"Deep modular co-attention networks for visual question answering","author":"Yu","year":"2019"},{"key":"10.1016\/j.ipm.2025.104606_bib0051","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2025.129345","article-title":"VQA and visual reasoning: An overview of approaches, datasets, and future direction","volume":"622","author":"Zakari","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.ipm.2025.104606_bib0052","series-title":"2021 IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"5575","article-title":"VinVL: Revisiting visual representations in vision-language models","author":"Zhang","year":"2021"},{"key":"10.1016\/j.ipm.2025.104606_bib0053","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2025.113625","article-title":"DAPlanner: Dual-agent framework with multi-modal large language model for autonomous driving motion planning","volume":"183","author":"Zhang","year":"2025","journal-title":"Applied Soft Computing"},{"key":"10.1016\/j.ipm.2025.104606_bib0054","series-title":"Proceedings of the twenty-ninth international joint conference on artificial intelligence, IJCAI 2020","first-page":"1097","article-title":"Mucko: Multi-layer cross-modal knowledge reasoning for fact-based visual question answering","author":"Zhu","year":"2020"}],"container-title":["Information Processing &amp; Management"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306457325005473?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306457325005473?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T14:42:45Z","timestamp":1776177765000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0306457325005473"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":54,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["S0306457325005473"],"URL":"https:\/\/doi.org\/10.1016\/j.ipm.2025.104606","relation":{},"ISSN":["0306-4573"],"issn-type":[{"value":"0306-4573","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"PrQAC : Prompting LLaMA3 with question-aware image captions and answer candidates for knowledge-based VQA","name":"articletitle","label":"Article Title"},{"value":"Information Processing & Management","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.ipm.2025.104606","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104606"}}