{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T06:52:46Z","timestamp":1743058366715,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819794300"},{"type":"electronic","value":"9789819794317"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-9431-7_16","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T14:03:04Z","timestamp":1730383384000},"page":"201-214","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["FacGPT: An Effective and\u00a0Efficient Method for\u00a0Evaluating Knowledge-Based Visual Question Answering"],"prefix":"10.1007","author":[{"given":"Sirui","family":"Cheng","sequence":"first","affiliation":[]},{"given":"Siyu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jiayi","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Muchen","family":"Lan","sequence":"additional","affiliation":[]},{"given":"Yaoru","family":"Sun","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"16_CR1","unstructured":"Bubeck, S., Chandrasekaran, V., Eldan, R.: Sparks of artificial general Intelligence: Early experiments with GPT-4.arXiv preprint arXiv:2303.12712 (2023)"},{"key":"16_CR2","unstructured":"Touvron, H., Lavril, T., Izacard, G.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"16_CR3","unstructured":"Politzer, T.: Vision is our dominant sense. Brainline. https:\/\/www.brainline.org\/article\/vision-our-dominant-sense (2008)"},{"key":"16_CR4","unstructured":"Alayrac, J. B., Donahue, J., Luc, P.: Flamingo: a visual language model for few-shot learning. In: International Conference on Advances in Neural Information Processing Systems, vol. 35, pp. 23716\u201323736 (2022)"},{"key":"16_CR5","unstructured":"Liu, H., Li, C., Wu, Q.: Visual instruction tuning. In: International Conference on Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"16_CR6","unstructured":"Zhu, D., Chen, J., Shen, X.: Minigpt-4: Enhancing vision-language understanding with advanced large language models. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"16_CR7","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y. J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv: 2310.03744 (2023)"},{"key":"16_CR8","unstructured":"Dai, W., Li, J., Li, D.: Instructblip: towards general-purpose vision-language models with instruction tuning. In: International Conference on Advances in Neural Information Processing Systems, vol. 36 (2023)"},{"key":"16_CR9","unstructured":"Gao, P., Han, J., Zhang, R.: Llama-adapter v2: parameter-efficient visual instruction model. arXiv preprint arXiv: 2304.15010 (2023)"},{"key":"16_CR10","unstructured":"Ye, Q., Xu, H., Xu, G.: mplug-owl: modularization empowers large language models with multimodality. arXiv preprint arXiv: 2304.14178 (2023)"},{"key":"16_CR11","unstructured":"OpenAI GPT-4V(ision) system card. https:\/\/openai.com\/research\/gpt-4v-system-card (2023)"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Lee, S., Kim, S., Park, S. H.: Prometheus-vision: vision-language model as a judge for fine-grained evaluation. arXiv preprint arXiv: 2401.06591 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.672"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Agrawal, A., Kajic, I., Bugliarello, E.: Reassessing evaluation practices in visual question answering: a case study on out-of-distribution generalization. Findings of the Association for Computational Linguistics: EACL, pp. 1201\u20131226 (2023)","DOI":"10.18653\/v1\/2023.findings-eacl.90"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Ma\u00f1as, O., Krojer, B., Agrawal, A.: Improving automatic VQA evaluation using large language models. In: The AAAI Conference on Artificial Intelligence, vol. 38, no. 5, pp. 4171\u20134179 (2024)","DOI":"10.1609\/aaai.v38i5.28212"},{"key":"16_CR15","unstructured":"Zhang, T., Kishore, V., Wu, F.: BERTScore: evaluating text generation with BERT. In: International Conference on Learning Representations (2019)"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Yin, S., Fu, C., Zhao, S.: A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)","DOI":"10.1093\/nsr\/nwae403"},{"key":"16_CR17","unstructured":"Yu, W., Yang, Z., Li, L.: Mm-vet: evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)"},{"key":"16_CR18","unstructured":"Ye, S., Kim, D., Kim, S.: Flask: fine-grained language model evaluation based on alignment skill sets. arXiv preprint arXiv:2307.10928 (2023)"},{"key":"16_CR19","doi-asserted-by":"crossref","unstructured":"Kim, S., Joo, S. J., Jang, Y.: Cotever: chain of thought prompting annotation toolkit for explanation verification. In:17th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp. 195\u2013208 (2023)","DOI":"10.18653\/v1\/2023.eacl-demo.23"},{"key":"16_CR20","unstructured":"Kim, S., Shin, J., Cho, Y.: Prometheus: inducing fine-grained evaluation capability in language models. arXiv preprint arXiv:2310.08491 (2023)"},{"key":"16_CR21","unstructured":"Wu, Z., Hu, Y., Shi, W.: Finegrained human feedback gives better rewards for language model training. In: International Conference on Advances in Neural Information Processing Systems, vol. 36 (2023)"},{"key":"16_CR22","unstructured":"Jang, J., Kim, S., Lin, B. Y.: Personalized soups: personalized large language model alignment via post-hoc parameter merging. arXiv preprint arXiv:2310.11564 (2023)"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Kim, T. S., Lee, T., Shin, J.: Evallm: interactive evaluation of large language model prompts on user-defined criteria. arXiv preprint arXiv:2309.13633 (2023)","DOI":"10.1145\/3613904.3642216"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Y., Duan, H., Zhang, Y.: MMBench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281(2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Wang, P., Wu, Q., Shen, C., Dick, A.: Explicit knowledge-based reasoning for visual question answering. In: 26th International Joint Conference on Artificial Intelligence, pp. 1290\u20131296 (2017)","DOI":"10.24963\/ijcai.2017\/179"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Chen, Z., Chen, J., Geng, Y.: Zero-shot visual question answering using knowledge graph. In: 20th International Semantic Web Conference, pp. 146\u2013162 (2021)","DOI":"10.1007\/978-3-030-88361-4_9"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: Ok-VQA: a visual question answering benchmark requiring external knowledge. In: the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204 (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"16_CR28","unstructured":"Chen, J., Zhu, D., Shen, X.: Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"16_CR29","unstructured":"Li, J., Li, D., Savarese, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: 40th International Conference on Machine Learning, pp. 19730\u201319742 (2023)"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Du, Z., Qian, Y., Liu, X.: GLM: general language model pretraining with autoregressive blank infilling. In: 60th Annual Meeting of the Association for Computational Linguistics, pp. 320\u2013335 (2022)","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"16_CR31","unstructured":"Awadalla, A., Gao, I., Gardner, J.: OpenFlamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"16_CR32","unstructured":"Ge, W., Chen, S., Hardy Chen, G.: MLLM-Bench: evaluating multimodals LLMs with per-sample criteria. arXiv preprint arXiv:2311.13951 (2024)"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-9431-7_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T15:54:23Z","timestamp":1732982063000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-9431-7_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9789819794300","9789819794317"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-9431-7_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hangzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2024\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}