{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T18:49:49Z","timestamp":1764874189961,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":54,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819626403","type":"print"},{"value":"9789819626410","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2641-0_22","type":"book-chapter","created":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T01:00:16Z","timestamp":1743382816000},"page":"324-337","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Exploring Visual Multiple-Choice Question Answering with\u00a0Pre-trained Vision-Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1889-1465","authenticated-orcid":false,"given":"Gia-Nghia","family":"Tran","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3893-8582","authenticated-orcid":false,"given":"Duc-Tuan","family":"Luu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8340-1405","authenticated-orcid":false,"given":"Dang-Van","family":"Thin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,29]]},"reference":[{"key":"22_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"22_CR2","unstructured":"Anil, R., et\u00a0al.: Palm 2 technical report. arXiv preprint arXiv:2305.10403 (2023)"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"22_CR4","unstructured":"Bai, J., et al.: QWEN-VL: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Biten, A.F., et al.: Scene text visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4291\u20134301 (2019)","DOI":"10.1109\/ICCV.2019.00439"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Chen, K., Wu, X.: VTQA: visual text question answering via entity alignment and cross-media reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
27218\u201327227 (2024)","DOI":"10.1109\/CVPR52733.2024.02570"},{"key":"22_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: UNiversal image-TExt representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Z.,et\u00a0al.: How far are we to GPT-4V? Closing the gap to commercial multimodal models with open-source suites. arXiv preprint arXiv:2404.16821 (2024)","DOI":"10.1007\/s11432-024-4231-5"},{"key":"22_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Z., et\u00a0al.: InternVL: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"22_CR10","unstructured":"Dubey, A., et\u00a0al.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"key":"22_CR11","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"Guo, J., et al.: From images to textual prompts: zero-shot visual question answering with frozen large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10867\u201310877 (2023)","DOI":"10.1109\/CVPR52729.2023.01046"},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"He, X., Zhang, Y., Mou, L., Xing, E., Xie, P.: PathVQA: 30000+ questions for medical visual question answering. arXiv preprint arXiv:2003.10286 (2020)","DOI":"10.36227\/techrxiv.13127537.v1"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"22_CR16","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"22_CR17","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7b. arXiv preprint arXiv:2310.06825 (2023)"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., Van Der\u00a0Maaten, L., Fei-Fei, L., Lawrence\u00a0Zitnick, C., Girshick, R.: Clevr: A diagnostic dataset for compositional language and elementary visual reasoning. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2901\u20132910 (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"22_CR19","unstructured":"Kenton, J.D.M.W.C., Toutanova, L.K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT. vol.\u00a01, p.\u00a02. Minneapolis, Minnesota (2019)"},{"key":"22_CR20","doi-asserted-by":"crossref","unstructured":"Khan, Z., Fu, Y.: Consistency and uncertainty: Identifying unreliable responses from black-box vision-language models for selective visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10854\u201310863 (2024)","DOI":"10.1109\/CVPR52733.2024.01032"},{"key":"22_CR21","doi-asserted-by":"crossref","unstructured":"Lan, Y., Li, X., Liu, X., Li, Y., Qin, W., Qian, W.: Improving zero-shot visual question answering via large language models with reasoning question prompts. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 4389\u20134400 (2023)","DOI":"10.1145\/3581783.3612389"},{"issue":"1","key":"22_CR22","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1038\/s41597-018-0002-5","volume":"5","author":"JJ Lau","year":"2018","unstructured":"Lau, J.J., Gayen, S., Ben Abacha, A., Demner-Fushman, D.: A dataset of clinically generated visual questions and answers about radiology images. Sci. Data 5(1), 1\u201310 (2018)","journal-title":"Sci. Data"},{"key":"22_CR23","doi-asserted-by":"crossref","unstructured":"Li, L., Peng, J., Chen, H., Gao, C., Yang, X.: How to configure good in-context sequence for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26710\u201326720 (2024)","DOI":"10.1109\/CVPR52733.2024.02522"},{"key":"22_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"22_CR25","doi-asserted-by":"crossref","unstructured":"Liu, B., Zhan, L.M., Xu, L., Ma, L., Yang, Y., Wu, X.M.: Slake: A semantically-labeled knowledge-enhanced dataset for medical visual question answering. In: 2021 IEEE 18th International Symposium on Biomedical Imaging (ISBI), pp. 1650\u20131654. IEEE (2021)","DOI":"10.1109\/ISBI48211.2021.9434010"},{"key":"22_CR26","volume":"580","author":"C Liu","year":"2024","unstructured":"Liu, C., Wang, C., Peng, Y., Li, Z.: ZVQAF: zero-shot visual question answering with feedback from large language models. Neurocomputing 580, 127505 (2024)","journal-title":"Neurocomputing"},{"key":"22_CR27","unstructured":"Liu, Y., Liang, Z., Wang, Y., He, M., Li, J., Zhao, B.: Seeing clearly, answering incorrectly: a multimodal robustness benchmark for evaluating MLLMS on leading questions. arXiv preprint arXiv:2406.10638 (2024)"},{"key":"22_CR28","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: VilBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 
32 (2019)"},{"key":"22_CR29","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"22_CR30","unstructured":"Lu, P., et al.: MathVista: evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255 (2023)"},{"key":"22_CR31","doi-asserted-by":"crossref","unstructured":"Malinowski, M., Rohrbach, M., Fritz, M.: Ask your neurons: a neural-based approach to answering questions about images. In: Proceedings of the IEEE International Conference on Computer Vision, pp.\u00a01\u20139 (2015)","DOI":"10.1109\/ICCV.2015.9"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Masry, A., Long, D.X., Tan, J.Q., Joty, S., Hoque, E.: ChartQA: a benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"22_CR33","doi-asserted-by":"crossref","unstructured":"Mathew, M., Bagal, V., Tito, R., Karatzas, D., Valveny, E., Jawahar, C.: Infographicvqa. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1697\u20131706 (2022)","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"22_CR34","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: DocVQA: a dataset for VQA on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"22_CR35","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"22_CR36","unstructured":"Reid, M., et\u00a0al.: Gemini 1.5: unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)"},{"key":"22_CR37","unstructured":"Ren, M., Kiros, R., Zemel, R.: Exploring models and data for image question answering. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"key":"22_CR38","doi-asserted-by":"publisher","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-okvqa: A benchmark for visual question answering using world knowledge. In: European Conference on Computer Vision, pp. 146\u2013162. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20074-8_9","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"22_CR39","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"22_CR40","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"22_CR41","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"778","DOI":"10.1007\/978-3-030-86331-9_50","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"R Tito","year":"2021","unstructured":"Tito, R., Karatzas, D., Valveny, E.: Document collection visual question answering. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12822, pp. 778\u2013792. Springer, Cham (2021). 
https:\/\/doi.org\/10.1007\/978-3-030-86331-9_50"},{"key":"22_CR42","unstructured":"Touvron, H., et\u00a0al.: LLAMA 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"22_CR43","unstructured":"Vaswani, A.: Attention is all you need. In: Advances in Neural Information Processing Systems (2017)"},{"key":"22_CR44","unstructured":"Wang, P.,et\u00a0al.: Qwen2-VL: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"issue":"10","key":"22_CR45","doi-asserted-by":"crossref","first-page":"2413","DOI":"10.1109\/TPAMI.2017.2754246","volume":"40","author":"P Wang","year":"2017","unstructured":"Wang, P., Wu, Q., Shen, C., Dick, A., Van Den Hengel, A.: FVQA: fact-based visual question answering. IEEE Trans. Pattern Anal. Mach. Intell. 40(10), 2413\u20132427 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"22_CR46","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: SimVLM: simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904 (2021)"},{"key":"22_CR47","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., Chua, T.S.: NExT-QA: next phase of question-answering to explaining temporal actions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9777\u20139786 (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"22_CR48","unstructured":"Yang, A., et\u00a0al.: Qwen2 technical report. arXiv preprint arXiv:2407.10671 (2024)"},{"key":"22_CR49","unstructured":"Yao, Y., et\u00a0al.: MiniCPM-V: a GPT-4v level MLLM on your phone. arXiv preprint arXiv:2408.01800 (2024)"},{"key":"22_CR50","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"22_CR51","doi-asserted-by":"crossref","unstructured":"Yu, Z., et al.: ActivityNet-QA: a dataset for understanding complex web videos via question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 9127\u20139134 (2019)","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"22_CR52","unstructured":"Yuan, L., et\u00a0al.: Florence: a new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"22_CR53","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., Beyer, L.: Sigmoid loss for language image pre-training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11975\u201311986 (2023)","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"22_CR54","doi-asserted-by":"crossref","unstructured":"Zhang, C., Gao, F., Jia, B., Zhu, Y., Zhu, S.C.: Raven: a dataset for relational and analogical visual reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
5317\u20135327 (2019)","DOI":"10.1109\/CVPR.2019.00546"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2641-0_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T01:00:57Z","timestamp":1743382857000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2641-0_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819626403","9789819626410"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2641-0_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"29 March 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}