{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:04:39Z","timestamp":1765339479068,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant No. 61977045"],"award-info":[{"award-number":["Grant No. 61977045"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755372","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:15Z","timestamp":1761375255000},"page":"4251-4260","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MPI-CD: Multi-Path Information Contrastive Decoding for Mitigating Hallucinations in Large Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5469-507X","authenticated-orcid":false,"given":"Jiacheng","family":"Ruan","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0674-4329","authenticated-orcid":false,"given":"Zongyun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6271-0903","authenticated-orcid":false,"given":"Jingsheng","family":"Gao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7571-564X","authenticated-orcid":false,"given":"Wenzhen","family":"Yuan","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3489-4578","authenticated-orcid":false,"given":"Ting","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5516-3016","authenticated-orcid":false,"given":"Yuzhuo","family":"Fu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02784"},{"key":"e_1_3_2_1_2_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_4_1","volume-title":"Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930","author":"Bai Zechen","year":"2024","unstructured":"Zechen Bai, Pichao Wang, Tianjun Xiao, Tong He, Zongbo Han, Zheng Zhang, and Mike Zheng Shou. 2024. 
Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930 (2024)."},{"key":"e_1_3_2_1_5_1","first-page":"15","article-title":"Visual information processing and its brain mechanism","volume":"35","author":"Bao Min","year":"2017","unstructured":"Min Bao, CB Huang, Li Wang, Tao Zhang, and Yi Jiang. 2017. Visual information processing and its brain mechanism. Sci. Technol. Rev, Vol. 35 (2017), 15-20.","journal-title":"Sci. Technol. Rev"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00188"},{"key":"e_1_3_2_1_7_1","volume-title":"Visual perception and cognitive performance. Helmet-mounted displays: sensation, Perception and Cognitive Issues","author":"Cap\u00f3-Aponte Jos\u00e9 E","year":"2009","unstructured":"Jos\u00e9 E Cap\u00f3-Aponte, Leonard A Temme, H Lee Task, Alan R Pinkus, Melvyn E Kalich, Allan J Pantle, and Clarence E Rash. 2009. Visual perception and cognitive performance. Helmet-mounted displays: sensation, Perception and Cognitive Issues (2009), 335-390."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00398"},{"key":"e_1_3_2_1_9_1","volume-title":"Halc: Object hallucination reduction via adaptive focal-contrast decoding. arXiv preprint arXiv:2403.00425","author":"Chen Zhaorun","year":"2024","unstructured":"Zhaorun Chen, Zhuokai Zhao, Hongyin Luo, Huaxiu Yao, Bo Li, and Jiawei Zhou. 2024. Halc: Object hallucination reduction via adaptive focal-contrast decoding. arXiv preprint arXiv:2403.00425 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al., 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed 14 April 2023), Vol. 2, 3 (2023), 6."},{"key":"e_1_3_2_1_11_1","unstructured":"Xiangxiang Chu Limeng Qiao Xinyu Zhang Shuang Xu Fei Wei Yang Yang Xiaofei Sun Yiming Hu Xinyang Lin Bo Zhang et al. 2024. Mobilevlm v2: Faster and stronger baseline for vision language model. arXiv preprint arXiv:2402.03766 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Dola: Decoding by contrasting layers improves factuality in large language models. arXiv preprint arXiv:2309.03883","author":"Chuang Yung-Sung","year":"2023","unstructured":"Yung-Sung Chuang, Yujia Xie, Hongyin Luo, Yoon Kim, James Glass, and Pengcheng He. 2023. Dola: Decoding by contrasting layers improves factuality in large language models. arXiv preprint arXiv:2309.03883 (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"e_1_3_2_1_14_1","first-page":"49250","volume-title":"Levine (Eds.)","volume":"36","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, DONGXU LI, Anthony Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 49250-49267. 
https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/9a6a435e75419a836fe47ab6793623e6-Paper-Conference.pdf"},{"key":"e_1_3_2_1_15_1","volume-title":"Benchmarking and improving detail image caption. arXiv preprint arXiv:2405.19092","author":"Dong Hongyuan","year":"2024","unstructured":"Hongyuan Dong, Jiawen Li, Bohong Wu, Jiacong Wang, Yuan Zhang, and Haoyuan Guo. 2024. Benchmarking and improving detail image caption. arXiv preprint arXiv:2405.19092 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01356"},{"key":"e_1_3_2_1_17_1","volume-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Jinrui Yang, Xiawu Zheng, Ke Li, Xing Sun, et al., 2023. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Visual perturbation-aware collaborative learning for overcoming the language prior problem. arXiv preprint arXiv:2207.11850","author":"Han Yudong","year":"2022","unstructured":"Yudong Han, Liqiang Nie, Jianhua Yin, Jianlong Wu, and Yan Yan. 2022. Visual perturbation-aware collaborative learning for overcoming the language prior problem. arXiv preprint arXiv:2207.11850 (2022)."},{"key":"e_1_3_2_1_19_1","unstructured":"Lei Huang Weijiang Yu Weitao Ma Weihong Zhong Zhangyin Feng Haotian Wang Qianglong Chen Weihua Peng Xiaocheng Feng Bing Qin et al. 2023. A survey on hallucination in large language models: Principles taxonomy challenges and open questions. ACM Transactions on Information Systems (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01274"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102270"},{"key":"e_1_3_2_1_23_1","volume-title":"A comprehensive survey of foundation models in medicine","author":"Wong Joshua K","year":"2025","unstructured":"Wasif Khan, Seowung Leem, Kyle B See, Joshua K Wong, Shaoting Zhang, and Ruogu Fang. 2025. A comprehensive survey of foundation models in medicine. IEEE Reviews in Biomedical Engineering (2025)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00018-011-0641-6"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00018-011-0641-6"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1561\/9781638283379"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_29_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023a. Evaluating object hallucination in large vision-language models. 
arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_30_1","first-page":"740","volume-title":"Zurich","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, 740-755."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_32_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024b. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253","author":"Liu Hanchao","year":"2024","unstructured":"Hanchao Liu, Wenyuan Xue, Yifei Chen, Dapeng Chen, Xiutian Zhao, Ke Wang, Liping Hou, Rongjun Li, and Wei Peng. 2024c. A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Liu Sheng","year":"2025","unstructured":"Sheng Liu, Haotian Ye, and James Zou. 2025. Reducing hallucinations in large vision-language models via latent space steering. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_35_1","volume-title":"Introducing meta llama 3: The most capable openly available llm to date. Meta AI","author":"Meta AI","year":"2024","unstructured":"AI Meta. 2024. Introducing meta llama 3: The most capable openly available llm to date. Meta AI (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Large language models: A survey. arXiv preprint arXiv:2402.06196","author":"Minaee Shervin","year":"2024","unstructured":"Shervin Minaee, Tomas Mikolov, Narjes Nikzad, Meysam Chenaghlu, Richard Socher, Xavier Amatriain, and Jianfeng Gao. 2024. Large language models: A survey. arXiv preprint arXiv:2402.06196 (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2009.06.006"},{"key":"e_1_3_2_1_38_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"compare, decide: Alleviating hallucination in large vision-language models via multi-view multi-path reasoning. arXiv preprint arXiv:2408.17150","author":"Qu Xiaoye","year":"2024","unstructured":"Xiaoye Qu, Jiashuo Sun, Wei Wei, and Yu Cheng. 2024. Look, compare, decide: Alleviating hallucination in large vision-language models via multi-view multi-path reasoning. arXiv preprint arXiv:2408.17150 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. 
Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"e_1_3_2_1_42_1","volume-title":"Ftii-bench: A comprehensive multimodal benchmark for flow text with image insertion. arXiv preprint arXiv:2410.12564","author":"Ruan Jiacheng","year":"2024","unstructured":"Jiacheng Ruan, Yebin Yang, Zehao Lin, Yuchen Feng, Feiyu Xiong, Zeyun Tang, and Zhiyu Li. 2024. Ftii-bench: A comprehensive multimodal benchmark for flow text with image insertion. arXiv preprint arXiv:2410.12564 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Vlrmbench: A comprehensive and challenging benchmark for vision-language reward models. arXiv preprint arXiv:2503.07478","author":"Ruan Jiacheng","year":"2025","unstructured":"Jiacheng Ruan, Wenzhen Yuan, Xian Gao, Ye Guo, Daoxin Zhang, Zhe Xu, Yao Hu, Ting Liu, and Yuzhuo Fu. 2025a. Vlrmbench: A comprehensive and challenging benchmark for vision-language reward models. arXiv preprint arXiv:2503.07478 (2025)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i7.32723"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"e_1_3_2_1_46_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_47_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Transformer mechanisms mimic frontostriatal gating operations when trained on human working memory tasks. arXiv preprint arXiv:2402.08211","author":"Traylor Aaron","year":"2024","unstructured":"Aaron Traylor, Jack Merullo, Michael J Frank, and Ellie Pavlick. 2024. Transformer mechanisms mimic frontostriatal gating operations when trained on human working memory tasks. arXiv preprint arXiv:2402.08211 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"An llm-free multi-dimensional benchmark for mllms hallucination evaluation. CoRR","author":"Wang Junyang","year":"2023","unstructured":"Junyang Wang, Yuhang Wang, Guohai Xu, Jing Zhang, Yukai Gu, Haitao Jia, Ming Yan, Ji Zhang, and Jitao Sang. 2023b. An llm-free multi-dimensional benchmark for mllms hallucination evaluation. CoRR (2023)."},{"key":"e_1_3_2_1_50_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079","author":"Wang Weihan","year":"2023","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, et al., 2023a. Cogvlm: Visual expert for pretrained language models. 
arXiv preprint arXiv:2311.03079 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Mitigating hallucinations in large vision-language models with instruction contrastive decoding. arXiv preprint arXiv:2403.18715","author":"Wang Xintong","year":"2024","unstructured":"Xintong Wang, Jingheng Pan, Liang Ding, and Chris Biemann. 2024b. Mitigating hallucinations in large vision-language models with instruction contrastive decoding. arXiv preprint arXiv:2403.18715 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Relating transformers to models and neural representations of the hippocampal formation. arXiv preprint arXiv:2112.04035","author":"Whittington James CR","year":"2021","unstructured":"James CR Whittington, Joseph Warren, and Timothy EJ Behrens. 2021. Relating transformers to models and neural representations of the hippocampal formation. arXiv preprint arXiv:2112.04035 (2021)."},{"key":"e_1_3_2_1_54_1","volume-title":"RITUAL: Random Image Transformations as a Universal Anti-hallucination Lever in LVLMs. arXiv preprint arXiv:2405.17821","author":"Woo Sangmin","year":"2024","unstructured":"Sangmin Woo, Jaehyuk Jang, Donguk Kim, Yubin Choi, and Changick Kim. 2024. RITUAL: Random Image Transformations as a Universal Anti-hallucination Lever in LVLMs. arXiv preprint arXiv:2405.17821 (2024)."},{"key":"e_1_3_2_1_55_1","unstructured":"Jiannan Wu Muyan Zhong Sen Xing Zeqiang Lai Zhaoyang Liu Wenhai Wang Zhe Chen Xizhou Zhu Lewei Lu Tong Lu et al. 2024. VisionLLM v2: An End-to-End Generalist Multimodal Large Language Model for Hundreds of Vision-Language Tasks. arXiv preprint arXiv:2406.08394 (2024)."},{"key":"e_1_3_2_1_56_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang et al. 2024a. Qwen2 Technical Report. arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_2_1_57_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024b. Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_58_1","volume-title":"A survey on multimodal large language models. National Science Review","author":"Yin Shukang","year":"2024","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2024. A survey on multimodal large language models. National Science Review (2024), nwae403."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01310"},{"key":"e_1_3_2_1_60_1","volume-title":"Rlaif-v: Aligning mllms through open-source ai feedback for super gpt-4v trustworthiness. arXiv preprint arXiv:2405.17220","author":"Yu Tianyu","year":"2024","unstructured":"Tianyu Yu, Haoye Zhang, Yuan Yao, Yunkai Dang, Da Chen, Xiaoman Lu, Ganqu Cui, Taiwen He, Zhiyuan Liu, Tat-Seng Chua, et al., 2024b. Rlaif-v: Aligning mllms through open-source ai feedback for super gpt-4v trustworthiness. arXiv preprint arXiv:2405.17220 (2024)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2025.01.020"},{"key":"e_1_3_2_1_63_1","volume-title":"European Conference on Computer Vision. Springer, 74-91","author":"Zhang Zheng","year":"2024","unstructured":"Zheng Zhang, Yeyao Ma, Enming Zhang, and Xiang Bai. 2024. Psalm: Pixelwise segmentation with large multi-modal model. In European Conference on Computer Vision. 
Springer, 74-91."},{"key":"e_1_3_2_1_64_1","volume-title":"Mitigating modality prior-induced hallucinations in multimodal large language models via deciphering attention causality. arXiv preprint arXiv:2410.04780","author":"Zhou Guanyu","year":"2024","unstructured":"Guanyu Zhou, Yibo Yan, Xin Zou, Kun Wang, Aiwei Liu, and Xuming Hu. 2024b. Mitigating modality prior-induced hallucinations in multimodal large language models via deciphering attention causality. arXiv preprint arXiv:2410.04780 (2024)."},{"key":"e_1_3_2_1_65_1","volume-title":"Walter Zimmer, Hu Cao, and Alois C Knoll.","author":"Zhou Xingcheng","year":"2024","unstructured":"Xingcheng Zhou, Mingyu Liu, Ekim Yurtsever, Bare Luka Zagar, Walter Zimmer, Hu Cao, and Alois C Knoll. 2024a. Vision language models in autonomous driving: A survey and outlook. IEEE Transactions on Intelligent Vehicles (2024)."},{"key":"e_1_3_2_1_66_1","volume-title":"Overcoming language priors with counterfactual inference for visual question answering. Chinese Computational Linguistics","author":"Zhu Jingbo","year":"2023","unstructured":"Jingbo Zhu. 2023. Overcoming language priors with counterfactual inference for visual question answering. Chinese Computational Linguistics (2023), 58."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.890"},{"key":"e_1_3_2_1_68_1","unstructured":"Xin Zou Yizhou Wang Yibo Yan Yuanhuiyi Lyu Kening Zheng Sirui Huang Junkai Chen Peijie Jiang Jia Liu Chang Tang et al. 2024. Look twice before you answer: Memory-space visual retracing for hallucination mitigation in multimodal large language models. arXiv preprint arXiv:2410.03577 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755372","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:00:50Z","timestamp":1765339250000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755372"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":68,"alternative-id":["10.1145\/3746027.3755372","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755372","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
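The full machine-readable record, including all 68 parsed references, is available from the public Crossref REST API. Below is a minimal retrieval sketch, assuming Python 3 with the third-party requests package installed; the endpoint and the {"status": "ok", "message-type": "work", "message": {...}} envelope match the record above, while error handling beyond a status check and Crossref's recommended polite-pool User-Agent header are omitted for brevity.

    import requests

    # Crossref exposes each work's metadata at /works/{DOI}.
    DOI = "10.1145/3746027.3755372"
    url = f"https://api.crossref.org/works/{DOI}"

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    # The response envelope is {"status": "ok", "message-type": "work",
    # "message": {...}}; the bibliographic fields live under "message".
    work = resp.json()["message"]

    print(work["title"][0])            # paper title (a one-element list)
    print(", ".join(f'{a["given"]} {a["family"]}' for a in work["author"]))
    print(work["container-title"][0])  # proceedings name
    print(work["page"], work["URL"])   # "4251-4260" and the DOI URL
    print(len(work.get("reference", [])), "parsed references")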