{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T06:34:20Z","timestamp":1763793260734,"version":"3.45.0"},"publisher-location":"Singapore","reference-count":39,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819533428","type":"print"},{"value":"9789819533435","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T00:00:00Z","timestamp":1763856000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T00:00:00Z","timestamp":1763856000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3343-5_42","type":"book-chapter","created":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T06:30:49Z","timestamp":1763793049000},"page":"542-553","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["See Better, Say Better: Vision-Augmented Decoding for\u00a0Mitigating Hallucinations in\u00a0Large Vision-Language Models"],"prefix":"10.1007","author":[{"given":"Xinyi","family":"Sun","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Diandian","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cong","family":"Cao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fangfang","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dakui","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanbing","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,23]]},"reference":[{"key":"42_CR1","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966, vol. 1, no. 2, p. 3 (2023)"},{"key":"42_CR2","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"42_CR3","unstructured":"Chuang, Y.S., Xie, Y., Luo, H., Kim, Y., Glass, J., He, P.: DoLa: decoding by contrasting layers improves factuality in large language models. arXiv preprint arXiv:2309.03883 (2023)"},{"key":"42_CR4","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"42_CR5","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. ArXiv abs\/2010.11929 (2020). https:\/\/api.semanticscholar.org\/CorpusID:225039882"},{"key":"42_CR6","unstructured":"Fu, C., et al.: MME: a comprehensive evaluation benchmark for multimodal large language models. ArXiv abs\/2306.13394 (2023). https:\/\/api.semanticscholar.org\/CorpusID:259243928"},{"key":"42_CR7","unstructured":"Gong, T., et al.: Multimodal-GPT: a vision and language model for dialogue with humans. arXiv preprint arXiv:2305.04790 (2023)"},{"key":"42_CR8","doi-asserted-by":"crossref","unstructured":"Gunjal, A., Yin, J., Bas, E.: Detecting and preventing hallucinations in large vision language models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 18135\u201318143 (2024)","DOI":"10.1609\/aaai.v38i16.29771"},{"key":"42_CR9","unstructured":"Han, Y., Nie, L., Yin, J., Wu, J., Yan, Y.: Visual perturbation-aware collaborative learning for overcoming the language prior problem. arXiv preprint arXiv:2207.11850 (2022)"},{"key":"42_CR10","doi-asserted-by":"crossref","unstructured":"He, X., Wei, L., Xie, L., Tian, Q.: Incorporating visual experts to resolve the information loss in multimodal large language models. arXiv preprint arXiv:2401.03105 (2024)","DOI":"10.24963\/ijcai.2024\/123"},{"key":"42_CR11","unstructured":"Hu, H., Zhang, J., Zhao, M., Sun, Z.: CIEM: contrastive instruction evaluation method for better instruction tuning. arXiv preprint arXiv:2309.02301 (2023)"},{"key":"42_CR12","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"42_CR13","doi-asserted-by":"crossref","unstructured":"Jiang, C., et al.: Hallucination augmented contrastive learning for multimodal large language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 27036\u201327046 (2024)","DOI":"10.1109\/CVPR52733.2024.02553"},{"key":"42_CR14","unstructured":"Lee, J., Cha, S., Lee, Y., Yang, C.: Visual question answering instruction: unlocking multimodal large language model to domain-specific visual multitasks. arXiv preprint arXiv:2402.08360 (2024)"},{"key":"42_CR15","doi-asserted-by":"crossref","unstructured":"Leng, S., et al.: Mitigating object hallucinations in large vision-language models through visual contrastive decoding. In: 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13872\u201313882 (2023). https:\/\/api.semanticscholar.org\/CorpusID:265466833","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"42_CR16","doi-asserted-by":"crossref","unstructured":"Leng, S., et al.: Mitigating object hallucinations in large vision-language models through visual contrastive decoding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13872\u201313882 (2024)","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"42_CR17","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"42_CR18","unstructured":"Li, X.L., et al.: Contrastive decoding: open-ended text generation as optimization. arXiv preprint arXiv:2210.15097 (2022)"},{"key":"42_CR19","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"42_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"42_CR21","unstructured":"Liu, F., Lin, K., Li, L., Wang, J., Yacoob, Y., Wang, L.: Mitigating hallucination in large multi-modal models via robust instruction tuning. arXiv preprint arXiv:2306.14565 (2023)"},{"key":"42_CR22","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"42_CR23","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural. Inf. Process. Syst. 36, 34892\u201334916 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"5","key":"42_CR24","first-page":"6","volume":"2","author":"A Meta","year":"2024","unstructured":"Meta, A.: Introducing meta llama 3: the most capable openly available LLM to date. Meta AI 2(5), 6 (2024)","journal-title":"Meta AI"},{"key":"42_CR25","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (2021). https:\/\/api.semanticscholar.org\/CorpusID:231591445"},{"key":"42_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"146","DOI":"10.1007\/978-3-031-20074-8_9","volume-title":"Computer Vision \u2013 ECCV 2022","author":"D Schwenk","year":"2022","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-OKVQA: a benchmark for visual question answering using world knowledge. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13668, pp. 146\u2013162. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20074-8_9"},{"key":"42_CR27","doi-asserted-by":"crossref","unstructured":"Sun, Z., et\u00a0al.: Aligning large multimodal models with factually augmented RLHF. arXiv preprint arXiv:2309.14525 (2023)","DOI":"10.18653\/v1\/2024.findings-acl.775"},{"key":"42_CR28","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"42_CR29","unstructured":"Wang, T., et al.: Caption anything: interactive image description with diverse multimodal controls. arXiv preprint arXiv:2305.02677 (2023)"},{"key":"42_CR30","doi-asserted-by":"crossref","unstructured":"Wang, X., Pan, J., Ding, L., Biemann, C.: Mitigating hallucinations in large vision-language models with instruction contrastive decoding. arXiv preprint arXiv:2403.18715 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.937"},{"key":"42_CR31","unstructured":"Yang, A., et\u00a0al.: Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)"},{"issue":"12","key":"42_CR32","doi-asserted-by":"publisher","first-page":"220105","DOI":"10.1007\/s11432-024-4251-x","volume":"67","author":"S Yin","year":"2024","unstructured":"Yin, S., et al.: Woodpecker: hallucination correction for multimodal large language models. SCI. CHINA Inf. Sci. 67(12), 220105 (2024)","journal-title":"SCI. CHINA Inf. Sci."},{"key":"42_CR33","doi-asserted-by":"crossref","unstructured":"Yu, Q., et al.: Hallucidoctor: mitigating hallucinatory toxicity in visual instruction data. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12944\u201312953 (2024)","DOI":"10.1109\/CVPR52733.2024.01230"},{"key":"42_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"196","DOI":"10.1007\/978-3-031-73113-6_12","volume-title":"Computer Vision \u2013 ECCV 2024","author":"J Zhang","year":"2024","unstructured":"Zhang, J., Wang, T., Zhang, H., Lu, P., Zheng, F.: Reflective instruction tuning: mitigating hallucinations in large vision-language models. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) ECCV 2024. LNCS, vol. 15126, pp. 196\u2013213. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-73113-6_12"},{"key":"42_CR35","unstructured":"Zhang, Z., Zhang, A., Li, M., Zhao, H., Karypis, G., Smola, A.: Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)"},{"key":"42_CR36","unstructured":"Zhao, Z., Wang, B., Ouyang, L., Dong, X., Wang, J., He, C.: Beyond hallucinations: enhancing LVLMs through hallucination-aware direct preference optimization. arXiv preprint arXiv:2311.16839 (2023)"},{"key":"42_CR37","unstructured":"Zhou, Y., et al.: Analyzing and mitigating object hallucination in large vision-language models. arXiv preprint arXiv:2310.00754 (2023)"},{"key":"42_CR38","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"42_CR39","doi-asserted-by":"crossref","unstructured":"Zhu, J.: Overcoming language priors with counterfactual inference for visual question answering. Chin. Comput. Linguist. 58 (2023)","DOI":"10.1007\/978-981-99-6207-5_4"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3343-5_42","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T06:30:56Z","timestamp":1763793056000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3343-5_42"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,23]]},"ISBN":["9789819533428","9789819533435"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3343-5_42","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,23]]},"assertion":[{"value":"23 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}