{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:52:55Z","timestamp":1763196775106,"version":"3.45.0"},"publisher-location":"Singapore","reference-count":33,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819533480","type":"print"},{"value":"9789819533497","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,16]],"date-time":"2025-11-16T00:00:00Z","timestamp":1763251200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,16]],"date-time":"2025-11-16T00:00:00Z","timestamp":1763251200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3349-7_7","type":"book-chapter","created":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:49:21Z","timestamp":1763196561000},"page":"81-93","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Retrieving and\u00a0Reading Multimodal Documents for\u00a0Knowledge-Based VQA"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7849-338X","authenticated-orcid":false,"given":"Wen","family":"Xie","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2550-3056","authenticated-orcid":false,"given":"Xin","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6335-1340","authenticated-orcid":false,"given":"Meishan","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3895-5510","authenticated-orcid":false,"given":"Min","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,16]]},"reference":[{"key":"7_CR1","unstructured":"Bai, J., et al.: Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond (2023)"},{"key":"7_CR2","doi-asserted-by":"crossref","unstructured":"Bin, Y., Li, H., Xu, Y., Xu, X., Yang, Y., Shen, H.T.: Unifying two-stream encoders with transformers for cross-modal retrieval. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 3041\u20133050 (2023)","DOI":"10.1145\/3581783.3612427"},{"key":"7_CR3","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol.\u00a033, pp. 1877\u20131901 (2020)"},{"key":"7_CR4","doi-asserted-by":"crossref","unstructured":"Bulian, J., Buck, C., Gajewski, W., B\u00f6rschinger, B., Schuster, T.: Tomayto, Tomahto. Beyond token-level answer equivalence for question answering evaluation. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing. Abu Dhabi, United Arab Emirates, pp. 291\u2013305 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.20"},{"key":"7_CR5","doi-asserted-by":"crossref","unstructured":"Caffagni, D., et al.: Wiki-LLAVA: hierarchical retrieval-augmented generation for multimodal LLMs. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1818\u20131826 (2024)","DOI":"10.1109\/CVPRW63382.2024.00188"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Y., et al.: Can pre-trained vision and language models answer visual information-seeking questions? In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, Singapore, pp. 14948\u201314968 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.925"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Cocchi, F., Moratelli, N., Cornia, M., Baraldi, L., Cucchiara, R.: Augmenting multimodal LLMs with self-reflective tokens for knowledge-based visual question answering (2024). https:\/\/arxiv.org\/abs\/2411.16863","DOI":"10.1109\/CVPR52734.2025.00859"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Gao, F., Ping, Q., Thattai, G., Reganti, A., Wu, Y.N., Natarajan, P.: Transform-retrieve-generate: Natural language-centric outside-knowledge visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5067\u20135077 (2022)","DOI":"10.1109\/CVPR52688.2022.00501"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Gard\u00e8res, F., Ziaeefard, M., Abeloos, B., Lecue, F.: ConceptBERT: concept-aware representation for visual question answering. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 489\u2013498 (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.44"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Gui, L., Wang, B., Huang, Q., Hauptmann, A., Bisk, Y., Gao, J.: KAT: a knowledge augmented transformer for vision-and-language. arXiv preprint arXiv:2112.08614 (2021)","DOI":"10.18653\/v1\/2022.naacl-main.70"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Ishmam, M.F., Shovon, M.S.H., Mridha, M.F., Dey, N.: From image to language: a critical analysis of visual question answering (VQA) approaches, challenges, and opportunities. Inf. Fusion., 102270 (2024)","DOI":"10.1016\/j.inffus.2024.102270"},{"key":"7_CR12","unstructured":"Izacard, G., et al.: Unsupervised dense information retrieval with contrastive learning. Trans. Mach. Learn. Res. 2022 (2022)"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Karpukhin, V., et al.: Dense passage retrieval for open-domain question answering. In: Proceedings the EMNLP 2020, pp. 6769\u20136781 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"7_CR14","doi-asserted-by":"crossref","unstructured":"Lerner, P., Ferret, O., Guinaudeau, C.: Cross-modal retrieval for knowledge-based visual question answering. In: European Conference on Information Retrieval, pp. 421\u2013438. Springer (2024)","DOI":"10.1007\/978-3-031-56027-9_26"},{"key":"7_CR15","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, ICML 2023, Honolulu, Hawaii, USA, 23\u201329 July 2023, vol.\u00a0202, pp. 19730\u201319742 (2023)"},{"key":"7_CR16","unstructured":"Li, Z., Zhang, X., Zhang, Y., Long, D., Xie, P., Zhang, M.: Towards general text embeddings with multi-stage contrastive learning. arXiv preprint arXiv:2308.03281 (2023)"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Lin, W., Byrne, B.: Retrieval augmented visual question answering with outside knowledge. In: Proceedings of the EMNLP, pp. 11238\u201311254, Abu Dhabi, United Arab Emirates (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.772"},{"key":"7_CR18","first-page":"22820","volume":"36","author":"W Lin","year":"2023","unstructured":"Lin, W., Chen, J., Mei, J., Coca, A., Byrne, B.: Fine-grained late-interaction multi-modal retrieval for retrieval augmented visual question answering. Adv. Neural. Inf. Process. Syst. 36, 22820\u201322840 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Lin, W., Mei, J., Chen, J., Byrne, B.: PreFLMR: scaling up fine-grained late-interaction multi-modal retrievers. In: Proceedings of the ACL, Bangkok, Thailand, pp. 5294\u20135316 (2024)","DOI":"10.18653\/v1\/2024.acl-long.289"},{"key":"7_CR20","unstructured":"Liu, H., et al.: LLaVA-NeXT: improved reasoning, OCR, and world knowledge (2024). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"7_CR21","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning (2023)"},{"key":"7_CR22","unstructured":"Lu, H., et\u00a0al.: DeepSeek-VL: towards real-world vision-language understanding. arXiv preprint arXiv:2403.05525 (2024)"},{"key":"7_CR23","doi-asserted-by":"crossref","unstructured":"Luo, M., Zeng, Y., Banerjee, P., Baral, C.: Weakly-supervised visual-retriever-reader for knowledge-based question answering. In: Proceedings of the EMNLP, Online and Punta Cana, Dominican Republic, pp. 6417\u20136431 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.517"},{"key":"7_CR24","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: OK-VQA: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204 (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Mensink, T., et al.: Encyclopedic VQA: visual questions about detailed properties of fine-grained categories. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3113\u20133124 (2023)","DOI":"10.1109\/ICCV51070.2023.00289"},{"key":"7_CR26","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"7_CR27","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-OKVQA: a benchmark for visual question answering using world knowledge. In: Proceedings of the ECCV 2022, vol. 13668, pp. 146\u2013162 (2022)","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Shah, S., Mishra, A., Yadati, N., Talukdar, P.P.: KVQA: knowledge-aware visual question answering. In: The Thirty-Third AAAI Conference on Artificial Intelligence, pp. 8876\u20138884 (2019)","DOI":"10.1609\/aaai.v33i01.33018876"},{"key":"7_CR30","doi-asserted-by":"crossref","unstructured":"Trotman, A., Puurula, A., Burgess, B.: Improvements to BM25 and language models examined. In: Proceedings of the 2014 Australasian Document Computing Symposium, ADCS 2014, Melbourne, VIC, Australia, p.\u00a058 (2014)","DOI":"10.1145\/2682862.2682863"},{"key":"7_CR31","unstructured":"Wang, P., et\u00a0al.: Qwen2-VL: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"key":"7_CR32","unstructured":"Yao, Y., et\u00a0al.: MiniCPM-V: a GPT-4V level MLLM on your phone. arXiv:2408.01800 (2024)"},{"key":"7_CR33","unstructured":"Zhang, T., et al.: RAFT: adapting language model to domain specific RAG. arXiv preprint arXiv:2403.10131 (2024)"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3349-7_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:49:30Z","timestamp":1763196570000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3349-7_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,16]]},"ISBN":["9789819533480","9789819533497"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3349-7_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,16]]},"assertion":[{"value":"16 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}