{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:07:14Z","timestamp":1777889234702,"version":"3.51.4"},"reference-count":58,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00137","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"1391-1401","source":"Crossref","is-referenced-by-count":0,"title":["Controlling Multimodal Llms Via Reward-Guided Decoding"],"prefix":"10.1109","author":[{"given":"Oscar","family":"Ma\u00f1as","sequence":"first","affiliation":[{"name":"Mila - Quebec AI Institute"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pierluca","family":"D'Oro","sequence":"additional","affiliation":[{"name":"Mila - Quebec AI Institute"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Koustuv","family":"Sinha","sequence":"additional","affiliation":[{"name":"Meta FAIR"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Adriana","family":"Romero-Soriano","sequence":"additional","affiliation":[{"name":"Mila - Quebec AI Institute"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michal","family":"Drozdzal","sequence":"additional","affiliation":[{"name":"Meta FAIR"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aishwarya","family":"Agrawal","sequence":"additional","affiliation":[{"name":"Mila - Quebec AI Institute"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","volume-title":"arXiv preprint","author":"Achiam","year":"2023"},{"key":"ref2","article-title":"Understanding alignment in multimodal 11 ms: A comprehensive study","author":"Amirloo","year":"2024","journal-title":"arXiv preprint"},{"key":"ref3","article-title":"Hallucination of multimodal large language models: A survey","author":"Bai","year":"2024","journal-title":"arXiv preprint"},{"key":"ref4","article-title":"Paligemma: A versatile 3 b vlm for transfer","author":"Beyer","year":"2024","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"An introduction to vision-language modeling","author":"Bordes","year":"2024","journal-title":"arXiv preprint"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.2307\/2334029"},{"key":"ref7","first-page":"2","article-title":"Large language monkeys: Scaling inference compute with repeated sampling","author":"Brown","year":"2024","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Language models are few-shot learners","author":"Brown","year":"2020","journal-title":"arXiv preprint"},{"key":"ref9","article-title":"The (r) evolution of multimodal large language models: A survey","author":"Caffagni","year":"2024","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref11","article-title":"Plug and play language models: A simple approach to controlled text generation","volume-title":"In International Conference on Learning Representations.","author":"Dathathri"},{"key":"ref12","first-page":"2","article-title":"Seeing is believing: Mitigating hallucination in large visionlanguage models via clip-guided decoding","author":"Deng","year":"2024","journal-title":"arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.721"},{"key":"ref14","first-page":"1","article-title":"The llama 3 herd of models","author":"Dubey","year":"2024","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01356"},{"key":"ref16","article-title":"Value augmented sampling for language model alignment and personalization","author":"Han","year":"2024","journal-title":"arXiv preprint"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1355"},{"key":"ref18","article-title":"Lora: Lowrank adaptation of large language models","volume-title":"In International Conference on Learning Representations.","author":"Hu"},{"key":"ref19","article-title":"Args: Alignment as reward-guided search","volume-title":"In The Twelfth International Conference on Learning Representations. 1","author":"Khanov"},{"key":"ref20","article-title":"Rewardbench: Evaluating reward models for language modeling","author":"Lambert","year":"2024","journal-title":"arXiv preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"ref22","article-title":"Sequential monte carlo steering of large language models using probabilistic programs","volume-title":"In ICML 2023 Workshop: Sampling and Optimization in Discrete Space.","author":"Lew"},{"key":"ref23","first-page":"2","article-title":"Cascade reward sampling for efficient decoding-time alignment","volume-title":"In ICML 2024 Next Generation of AI Safety Workshop.","author":"Li"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.358"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref26","article-title":"Mitigating hallucination in large multi-modal models via robust instruction tuning","volume-title":"In The Twelfth International Conference on Learning Representations. 1","author":"Liu"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref28","article-title":"Don\u2019t throw away your value model! generating more preferable text with value-guided monte-carlo tree search decoding","volume-title":"In First Conference on Language Modeling","author":"Liu"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1016"},{"key":"ref30","article-title":"Smolvlm: Redefining small and efficient multimodal models","author":"Marafioti","year":"2025","journal-title":"arXiv preprint"},{"key":"ref31","article-title":"Scaling open-vocabulary object detection","volume":"36","author":"Minderer","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref32","article-title":"Controlled decoding from language models","volume-title":"In Forty-first International Conference on Machine Learning. 1","author":"Mudgal"},{"key":"ref33","first-page":"2773027744","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref34","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"In International conference on machine learning","author":"Radford"},{"key":"ref35","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref36","article-title":"A critical look at tokenwise reward-guided text generation","author":"Rashid","year":"2024","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1437"},{"key":"ref39","first-page":"2","article-title":"Mitigating object hallucination via data augmented contrastive tuning","author":"Sarkar","year":"2024","journal-title":"arXiv preprint"},{"key":"ref40","first-page":"1","article-title":"Scaling 11 m test-time compute optimally can be more effective than scaling model parameters","author":"Snell","year":"2024","journal-title":"arXiv preprint"},{"key":"ref41","article-title":"Paligemma 2: A family of versatile vlms for transfer","author":"Steiner","year":"2024","journal-title":"arXiv preprint"},{"key":"ref42","first-page":"1","article-title":"Aligning large multimodal models with factually augmented rlhf","author":"Sun","year":"2023","journal-title":"arXiv preprint"},{"key":"ref43","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv preprint"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72986-7_12"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.460"},{"key":"ref46","first-page":"4","article-title":"An 11 m-free multi-dimensional benchmark for mllms hallucination evaluation","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.276"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4251-x"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01310"},{"key":"ref50","article-title":"Rlaif-v: Aligning mllms through open-source ai feedback for super gpt-4v trustworthiness","author":"Yu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.633"},{"key":"ref52","article-title":"Selfcorrecting decoding with generative feedback for mitigating hallucinations in large vision-language models","author":"Zhang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref53","article-title":"Multimodal chain-of-thought reasoning in language models","author":"Zhang","journal-title":"Transactions on Machine Learning Research. 1"},{"key":"ref54","article-title":"Mitigating object hallucination in large visionlanguage models via classifier-free guidance","author":"Zhao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref55","first-page":"2","article-title":"Beyond hallucinations: Enhancing lvlms through hallucination-aware direct preference optimization","author":"Zhao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref56","article-title":"Aligning modalities in vision large language models via preference fine-tuning","volume-title":"In ICLR 2024 Workshop on Reliable and Responsible Foundation Models","author":"Zhou"},{"key":"ref57","article-title":"Analyzing and mitigating object hallucination in large vision-language models","volume-title":"In The Twelfth International Conference on Learning Representations","author":"Zhou"},{"key":"ref58","first-page":"2","article-title":"Calibrated self-rewarding vision language models","author":"Zhou","year":"2024","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11443531.pdf?arnumber=11443531","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:08:10Z","timestamp":1777612090000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11443531\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":58,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00137","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}