{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T02:02:22Z","timestamp":1780020142893,"version":"3.53.1"},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.knosys.2026.116157","type":"journal-article","created":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T11:18:53Z","timestamp":1778757533000},"page":"116157","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Enabling collaborative parametric knowledge calibration for retrieval-augmented Vision Question Answering"],"prefix":"10.1016","volume":"346","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5426-127X","authenticated-orcid":false,"given":"Jiaqi","family":"Deng","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kaize","family":"Shi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4450-7738","authenticated-orcid":false,"given":"Zonghan","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huan","family":"Huo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dingxian","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4493-6663","authenticated-orcid":false,"given":"Guandong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"10","key":"10.1016\/j.knosys.2026.116157_b1","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1145\/2629489","article-title":"Wikidata","volume":"57","author":"Vrande\u010di\u0107","year":"2014","journal-title":"Commun. ACM"},{"key":"10.1016\/j.knosys.2026.116157_b2","doi-asserted-by":"crossref","unstructured":"M. Luo, Y. Zeng, P. Banerjee, C. Baral, Weakly-Supervised Visual-Retriever-Reader for Knowledge-based Question Answering, in: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, 2021.","DOI":"10.18653\/v1\/2021.emnlp-main.517"},{"key":"10.1016\/j.knosys.2026.116157_b3","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2021.107650","article-title":"Fact-based visual question answering via dual-process system","volume":"237","author":"Liu","year":"2022","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116157_b4","first-page":"3081","article-title":"An empirical study of GPT-3 for few-shot knowledge-based VQA","volume":"vol. 36","author":"Yang","year":"2022"},{"key":"10.1016\/j.knosys.2026.116157_b5","doi-asserted-by":"crossref","unstructured":"J.B. Alayrac, J. Donahue, P. Luc, A. Miech, I. Barr, Y. Hasson, K. Lenc, A. Mensch, K. Millican, M. Reynolds, R. Ring, E. Rutherford, S. Cabi, T. Han, Z. Gong, S. Samangooei, M. Monteiro, J. Menick, S. Borgeaud, A. Brock, A. Nematzadeh, S. Sharifzadeh, M. Binkowski, R. Barreira, O. Vinyals, A. Zisserman, K. Simonyan, Flamingo: a Visual Language Model for Few-Shot Learning, in: Proceedings of Neural Information Processing Systems (NeurIPS), 2022.","DOI":"10.52202\/068431-1723"},{"key":"10.1016\/j.knosys.2026.116157_b6","unstructured":"X. Chen, X. Wang, S. Changpinyo, A. Piergiovanni, P. Padlewski, D. Salz, S. Goodman, A. Grycner, B. Mustafa, L. Beyer, A. Kolesnikov, J. Puigcerver, N. Ding, K. Rong, H. Akbari, G. Mishra, L. Xue, A. Thapliyal, J. Bradbury, W. Kuo, M. Seyedhosseini, C. Jia, B.K. Ayan, C. Riquelme, A. Steiner, A. Angelova, X. Zhai, N. Houlsby, R. Soricut, PaLI: A Jointly-Scaled Multilingual Language-Image Model, in: International Conference on Learning Representations, ICLR, 2023."},{"key":"10.1016\/j.knosys.2026.116157_b7","series-title":"GeReA: Question-aware prompt captions for knowledge-based visual question answering","author":"Ma","year":"2024"},{"key":"10.1016\/j.knosys.2026.116157_b8","article-title":"SCAG: Semantic co-occurring attention guided alignment for knowledge-based visual question answering","author":"Liu","year":"2025","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.knosys.2026.116157_b9","series-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"9802","article-title":"When not to trust language models: Investigating effectiveness of parametric and non-parametric memories","author":"Mallen","year":"2023"},{"key":"10.1016\/j.knosys.2026.116157_b10","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing","first-page":"12076","article-title":"FActScore: Fine-grained atomic evaluation of factual precision in long form text generation","author":"Min","year":"2023"},{"key":"10.1016\/j.knosys.2026.116157_b11","doi-asserted-by":"crossref","unstructured":"L. Gui, B. Wang, Q. Huang, A. Hauptmann, Y. Bisk, J. Gao, KAT: A Knowledge Augmented Transformer for Vision-and-Language, in: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics, 2021.","DOI":"10.18653\/v1\/2022.naacl-main.70"},{"key":"10.1016\/j.knosys.2026.116157_b12","unstructured":"Y. Lin, Y. Xie, D. Chen, Y. Xu, C. Zhu, L. Yuan, REVIVE: Regional Visual Representation Matters in Knowledge-Based Visual Question Answering, in: 36th Conference on Neural Information Processing Systems (NeurIPS 2022), 2022."},{"key":"10.1016\/j.knosys.2026.116157_b13","unstructured":"W. Lin, J. Chen, J. Mei, A. Coca, B. Byrne, Fine-grained Late-interaction Multi-modal Retrieval for Retrieval Augmented Visual Question Answering, in: 37th Conference on Neural Information Processing Systems, 2023."},{"key":"10.1016\/j.knosys.2026.116157_b14","series-title":"European Conference on Computer Vision","first-page":"132","article-title":"HYDRA: A hyper agent for dynamic compositional visual reasoning","author":"Ke","year":"2025"},{"key":"10.1016\/j.knosys.2026.116157_b15","doi-asserted-by":"crossref","unstructured":"W. Lin, B. Byrne, Retrieval Augmented Visual Question Answering with Outside Knowledge, in: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, 2022.","DOI":"10.18653\/v1\/2022.emnlp-main.772"},{"key":"10.1016\/j.knosys.2026.116157_b16","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1162\/tacl_a_00530","article-title":"Improving the domain adaptation of retrieval augmented generation (RAG) models for open domain question answering","volume":"11","author":"Siriwardhana","year":"2023","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10.1016\/j.knosys.2026.116157_b17","series-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5057","article-title":"Transform-retrieve-generate: Natural language-centric outside-knowledge visual question answering","author":"Gao","year":"2022"},{"key":"10.1016\/j.knosys.2026.116157_b18","unstructured":"A. Radford, J.W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, G. Krueger, I. Sutskever, Learning Transferable Visual Models From Natural Language Supervision, in: Proceedings of the 38 Th International Conference on Machine Learning, 2021."},{"key":"10.1016\/j.knosys.2026.116157_b19","unstructured":"J. Li, D. Li, C. Xiong, S. Hoi, BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation, in: Proceedings of the 39 th International Conference on Machine Learning, 2022."},{"key":"10.1016\/j.knosys.2026.116157_b20","unstructured":"J. Li, D. Li, S. Savarese, S. Hoi, BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models, in: Proceedings of the 40 th International Conference on Machine Learning, 2023."},{"key":"10.1016\/j.knosys.2026.116157_b21","doi-asserted-by":"crossref","unstructured":"W. Dai, J. Li, D. Li, A.M.H. Tiong, J. Zhao, W. Wang, B. Li, P. Fung, S. Hoi, InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning, in: Proceedings of the 37th International Conference on Neural Information Processing Systems, 2023.","DOI":"10.52202\/075280-2142"},{"key":"10.1016\/j.knosys.2026.116157_b22","doi-asserted-by":"crossref","unstructured":"W. Chen, H. Hu, X. Chen, P. Verga, W.W. Cohen, MuRAG: Multimodal Retrieval-Augmented Generator for Open Question Answering over Images and Text, in: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, 2022.","DOI":"10.18653\/v1\/2022.emnlp-main.375"},{"key":"10.1016\/j.knosys.2026.116157_b23","unstructured":"M. Yasunaga, A. Aghajanyan, W. Shi, R. James, J. Leskovec, P. Liang, M. Lewis, L. Zettlemoyer, W.t. Yih, Retrieval-Augmented Multimodal Language Modeling, in: Proceedings of the 40 th International Conference on Machine Learning, 2022."},{"key":"10.1016\/j.knosys.2026.116157_b24","doi-asserted-by":"crossref","unstructured":"Z. Hu, A. Iscen, C. Sun, Z. Wang, K.W. Chang, Y. Sun, C. Schmid, D.A. Ross, A. Fathi, REVEAL: Retrieval-Augmented Visual-Language Pre-Training with Multi-Source Multimodal Knowledge Memory, in: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022.","DOI":"10.1109\/CVPR52729.2023.02238"},{"key":"10.1016\/j.knosys.2026.116157_b25","doi-asserted-by":"crossref","unstructured":"W. Lin, J. Mei, J. Chen, B. Byrne, PreFLMR: Scaling Up Fine-Grained Late-Interaction Multi-modal Retrievers, in: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, 2024.","DOI":"10.18653\/v1\/2024.acl-long.289"},{"key":"10.1016\/j.knosys.2026.116157_b26","doi-asserted-by":"crossref","unstructured":"O. Khattab, M. Zaharia, ColBERT: Efficient and Effective Passage Search via Contextualized Late Interaction over BERT, in: Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval, 2020.","DOI":"10.1145\/3397271.3401075"},{"key":"10.1016\/j.knosys.2026.116157_b27","unstructured":"Z. Yu, X. Ouyang, Z. Shao, M. Wang, J. Yu, Prophet: Prompting Large Language Models with Complementary Answer Heuristics for Knowledge-based Visual Question Answering, in: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2023."},{"key":"10.1016\/j.knosys.2026.116157_b28","series-title":"Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval","first-page":"110","article-title":"A symmetric dual encoding dense retrieval framework for knowledge-intensive visual question answering","author":"Salemi","year":"2023"},{"key":"10.1016\/j.knosys.2026.116157_b29","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113556","article-title":"HiLINK: Hierarchical linking of context-aware knowledge prediction and prompt tuning for bilingual knowledge-based visual question answering","volume":"319","author":"Jeong","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116157_b30","unstructured":"T.B. Brown, B. Mann, N. Ryder, M. Subbiah, J. Kaplan, P. Dhariwal, A. Neelakantan, P. Shyam, G. Sastry, A. Askell, S. Agarwal, A. Herbert-Voss, G. Krueger, T. Henighan, R. Child, A. Ramesh, D.M. Ziegler, J. Wu, C. Winter, C. Hesse, M. Chen, E. Sigler, M. Litwin, S. Gray, B. Chess, J. Clark, C. Berner, S. McCandlish, A. Radford, I. Sutskever, D. Amodei, Language Models are Few-Shot Learners, in: 34th Conference on Neural Information Processing Systems (NeurIPS 2020), 2020."},{"key":"10.1016\/j.knosys.2026.116157_b31","series-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","first-page":"4231","article-title":"Open domain question answering using early fusion of knowledge bases and text","author":"Sun","year":"2018"},{"key":"10.1016\/j.knosys.2026.116157_b32","series-title":"Proceedings of the 30th ACM International Conference on Multimedia","first-page":"2061","article-title":"A unified end-to-end retriever-reader framework for knowledge-based VQA","author":"Guo","year":"2022"},{"key":"10.1016\/j.knosys.2026.116157_b33","doi-asserted-by":"crossref","unstructured":"R. Speer, J. Chin, C. Havasi, ConceptNet 5.5: An Open Multilingual Graph of General Knowledge, in: Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence, 2016.","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"10.1016\/j.knosys.2026.116157_b34","series-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5079","article-title":"MuKEA: Multimodal knowledge extraction and accumulation for knowledge-based visual question answering","author":"Ding","year":"2022"},{"key":"10.1016\/j.knosys.2026.116157_b35","doi-asserted-by":"crossref","unstructured":"H. Liu, K. Son, J. Yang, C. Liu, J. Gao, Y.J. Lee, C. Li, Learning Customized Visual Models with Retrieval-Augmented Knowledge, in: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2023.","DOI":"10.1109\/CVPR52729.2023.01454"},{"key":"10.1016\/j.knosys.2026.116157_b36","series-title":"Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume","first-page":"874","article-title":"Leveraging passage retrieval with generative models for open domain question answering","author":"Izacard","year":"2021"},{"key":"10.1016\/j.knosys.2026.116157_b37","unstructured":"A. Dosovitskiy, L. Beyer, A. Kolesnikov, D. Weissenborn, X. Zhai, T. Unterthiner, M. Dehghani, M. Minderer, G. Heigold, S. Gelly, J. Uszkoreit, N. Houlsby, An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale, in: 9th International Conference on Learning Representations, ICLR 2021, 2020."},{"key":"10.1016\/j.knosys.2026.116157_b38","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2019","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.knosys.2026.116157_b39","series-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing","first-page":"6769","article-title":"Dense passage retrieval for open-domain question answering","author":"Karpukhin","year":"2020"},{"key":"10.1016\/j.knosys.2026.116157_b40","doi-asserted-by":"crossref","unstructured":"K. Santhanam, O. Khattab, C. Potts, M. Zaharia, PLAID: An Efficient Engine for Late Interaction Retrieval, in: CIKM \u201922: Proceedings of the 31st ACM International Conference on Information & Knowledge Management, 2022.","DOI":"10.1145\/3511808.3557325"},{"key":"10.1016\/j.knosys.2026.116157_b41","doi-asserted-by":"crossref","unstructured":"K. Marino, M. Rastegari, A. Farhadi, R. Mottaghi, OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge, in: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2019.","DOI":"10.1109\/CVPR.2019.00331"},{"key":"10.1016\/j.knosys.2026.116157_b42","doi-asserted-by":"crossref","unstructured":"Y. Chen, H. Hu, Y. Luan, H. Sun, S. Changpinyo, A. Ritter, M.W. Chang, Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?, in: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, 2023.","DOI":"10.18653\/v1\/2023.emnlp-main.925"},{"key":"10.1016\/j.knosys.2026.116157_b43","doi-asserted-by":"crossref","unstructured":"T.-Y. Lin, M. Maire, S. Belongie, L. Bourdev, R. Girshick, J. Hays, P. Perona, D. Ramanan, C.L. Zitnick, P. Doll\u00e1r, Microsoft COCO: Common Objects in Context, in: European Conference on Computer Vision, 2014.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10.1016\/j.knosys.2026.116157_b44","series-title":"2023 IEEE\/CVF International Conference on Computer Vision","first-page":"12031","article-title":"Open-domain visual entity recognition: Towards recognizing millions of wikipedia entities","author":"Hu","year":"2023"},{"key":"10.1016\/j.knosys.2026.116157_b45","unstructured":"S.Y. Liu, C.Y. Wang, H. Yin, P. Molchanov, Y.C.F. Wang, K.T. Cheng, M.H. Chen, DoRA: Weight-Decomposed Low-Rank Adaptation, in: Proceedings of the 41st International Conference on Machine Learning, 2024."},{"key":"10.1016\/j.knosys.2026.116157_b46","unstructured":"I. Loshchilov, F. Hutter, Decoupled Weight Decay Regularization, in: The Seventh International Conference on Learning Representations, 2017."},{"key":"10.1016\/j.knosys.2026.116157_b47","first-page":"2712","article-title":"Multi-modal answer validation for knowledge-based VQA","volume":"vol. 36","author":"Wu","year":"2022"},{"key":"10.1016\/j.knosys.2026.116157_b48","unstructured":"D. Driess, F. Xia, M.S.M. Sajjadi, C. Lynch, A. Chowdhery, B. Ichter, A. Wahid, J. Tompson, Q. Vuong, T. Yu, W. Huang, Y. Chebotar, P. Sermanet, D. Duckworth, S. Levine, V. Vanhoucke, K. Hausman, M. Toussaint, K. Greff, A. Zeng, I. Mordatch, P. Florence, PaLM-E: An Embodied Multimodal Language Model, in: Proceedings of the 40th International Conference on Machine Learning, 2023."}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S095070512600883X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S095070512600883X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T01:12:11Z","timestamp":1780017131000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S095070512600883X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":48,"alternative-id":["S095070512600883X"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116157","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Enabling collaborative parametric knowledge calibration for retrieval-augmented Vision Question Answering","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116157","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"116157"}}