{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T20:43:43Z","timestamp":1776977023359,"version":"3.51.4"},"reference-count":17,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T00:00:00Z","timestamp":1771891200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T00:00:00Z","timestamp":1771891200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea (NRF)","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,2,24]]},"DOI":"10.1109\/icaiic68212.2026.11454196","type":"proceedings-article","created":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T19:50:24Z","timestamp":1774986624000},"page":"535-540","source":"Crossref","is-referenced-by-count":0,"title":["Fine-Grained Rewards for Visual CoT: Mitigating Hallucinations in Vision-Language Models"],"prefix":"10.1109","author":[{"given":"Jimyung","family":"Park","sequence":"first","affiliation":[{"name":"Yonsei University,Department of Industrial Engineering,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minhyuk","family":"Jeong","sequence":"additional","affiliation":[{"name":"Yonsei University,Department of Industrial Engineering,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongjun","family":"Kim","sequence":"additional","affiliation":[{"name":"Yonsei University,Department of Industrial Engineering,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hyunjun","family":"Yuh","sequence":"additional","affiliation":[{"name":"Yonsei University,Department of Industrial Engineering,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jeonghoon","family":"Mo","sequence":"additional","affiliation":[{"name":"Yonsei University,Department of Industrial Engineering,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","year":"2023","journal-title":"arXiv preprint"},{"key":"ref2","article-title":"Gemini: A family of highly capable multimodal models","author":"Anil","year":"2023","journal-title":"arXiv preprint"},{"key":"ref3","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref4","article-title":"Visual instruction tuning","volume-title":"Advances in Neural Information Processing Systems 36 (NeurIPS 2023)","author":"Liu","year":"2023"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"292","DOI":"10.18653\/v1\/2023.emnlp-main.20","article-title":"Evaluating object hallucination in large vision-language models","volume-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"Li","year":"2023"},{"key":"ref6","doi-asserted-by":"crossref","first-page":"13088","DOI":"10.18653\/v1\/2024.findings-acl.775","article-title":"Aligning large multimodal models with factually augmented RLHF","volume-title":"Findings of the Association for Computational Linguistics: ACL 2024","author":"Sun","year":"2024"},{"key":"ref7","article-title":"Visual cot: Advancing multi-modal language models with a comprehensive dataset and benchmark for chain-of-thought reasoning","author":"Shao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Ground-r1: Incentivizing grounded visual reasoning via reinforcement learning","author":"Cao","year":"2025","journal-title":"arXiv preprint"},{"key":"ref9","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR52733.2024.01310","article-title":"Rlhf-v: Towards trustworthy mllms via behavior alignment from fine-grained correctional human feedback","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Yu","year":"2024"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"13088","DOI":"10.18653\/v1\/2024.findings-acl.775","article-title":"Aligning large multimodal models with factually augmented RLHF","volume-title":"Findings of the Association for Computational Linguistics: ACL 2024","author":"Sun","year":"2024"},{"key":"ref11","article-title":"Aligning modalities in vision large language models via preference fine-tuning","author":"Zhou","year":"2024","journal-title":"arXiv preprint"},{"key":"ref12","article-title":"Beyond hallucinations: Enhancing lvlms through hallucination-aware direct preference optimization","author":"Zhao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref13","first-page":"391","article-title":"Volcano: Mitigating multimodal hallucination through self-feedback guided revision","volume-title":"Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL), Long Papers","author":"Lee","year":"2024"},{"key":"ref14","article-title":"Fgaif: Aligning large vision-language models with fine-grained ai feedback","volume-title":"Transactions on Machine Learning Research (TMLR)","author":"Jing","year":"2025"},{"key":"ref15","first-page":"6904","article-title":"Making the v in VQA matter: Elevating the role of image understanding in visual question answering","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Goyal","year":"2017"},{"key":"ref16","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref17","volume-title":"Unified-reward: Open and unified reward model for llms\/lvlms","author":"Team","year":"2025"}],"event":{"name":"2026 International Conference on Artificial Intelligence in Information and Communication (ICAIIC)","location":"Tokyo, Japan","start":{"date-parts":[[2026,2,24]]},"end":{"date-parts":[[2026,2,27]]}},"container-title":["2026 International Conference on Artificial Intelligence in Information and Communication (ICAIIC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11454127\/11454137\/11454196.pdf?arnumber=11454196","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T19:56:05Z","timestamp":1776974165000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11454196\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,24]]},"references-count":17,"URL":"https:\/\/doi.org\/10.1109\/icaiic68212.2026.11454196","relation":{},"subject":[],"published":{"date-parts":[[2026,2,24]]}}}