{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T17:55:52Z","timestamp":1775325352684,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","funder":[{"name":"Intelligent Game and Decision Lab","award":["2022A000300"],"award-info":[{"award-number":["2022A000300"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755364","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:15Z","timestamp":1761375255000},"page":"4232-4241","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Debiasing Multimodal Large Language Models via Penalization of Language Priors"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6227-0183","authenticated-orcid":false,"given":"YiFan","family":"Zhang","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9241-236X","authenticated-orcid":false,"given":"Yang","family":"Shi","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7935-2358","authenticated-orcid":false,"given":"Weichen","family":"Yu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4516-2524","authenticated-orcid":false,"given":"Qingsong","family":"Wen","sequence":"additional","affiliation":[{"name":"Squirrel AI Learning, Bellevue, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2296-9688","authenticated-orcid":false,"given":"Xue","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6997-0406","authenticated-orcid":false,"given":"Wenjing","family":"Yang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9425-3065","authenticated-orcid":false,"given":"Zhang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5224-8647","authenticated-orcid":false,"given":"Liang","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8797-4646","authenticated-orcid":false,"given":"Rong","family":"Jin","sequence":"additional","affiliation":[{"name":"Meta, Seattle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023a. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023b. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","unstructured":"Hung-Ting Chen Michael Zhang and Eunsol Choi. 2022. Rich Knowledge Sources Bring Complex Knowledge Conflicts: Recalibrating Models to Reflect Conflicting Evidence. In EMNLP."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/MOST60774.2024.00009"},{"key":"e_1_3_2_1_6_1","volume-title":"Towards end-to-end embodied decision making via multi-modal large language model: Explorations with gpt4-vision and beyond. arXiv preprint arXiv:2310.02071","author":"Chen Liang","year":"2023","unstructured":"Liang Chen, Yichi Zhang, Shuhuai Ren, Haozhe Zhao, Zefan Cai, Yuchi Wang, Peiyi Wang, Tianyu Liu, and Baobao Chang. 2023. Towards end-to-end embodied decision making via multi-modal large language model: Explorations with gpt4-vision and beyond. arXiv preprint arXiv:2310.02071 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"VersaVid-R1: A Versatile Video Understanding and Reasoning Model from Question Answering to Captioning Tasks. arXiv preprint arXiv:2506.09079","author":"Chen Xinlong","year":"2025","unstructured":"Xinlong Chen, Yuanxing Zhang, Yushuo Guan, Bohan Zeng, Yang Shi, Sihan Yang, Pengfei Wan, Qiang Liu, Liang Wang, and Tieniu Tan. 2025. VersaVid-R1: A Versatile Video Understanding and Reasoning Model from Question Answering to Captioning Tasks. arXiv preprint arXiv:2506.09079 (2025)."},{"key":"e_1_3_2_1_8_1","unstructured":"Zhili Cheng Yuge Tu Ran Li Shiqi Dai Jinyi Hu Shengding Hu Jiahao Li Yang Shi Tianyu Yu Weize Chen et al. 2025. EmbodiedEval: Evaluate Multimodal LLMs as Embodied Agents. arXiv preprint arXiv:2501.11858 (2025)."},{"key":"e_1_3_2_1_9_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al., 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed 14 April 2023) (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"e_1_3_2_1_11_1","volume-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning. arxiv","author":"Dai W","year":"2023","unstructured":"W Dai, J Li, D Li, AMH Tiong, J Zhao, W Wang, B Li, P Fung, and S Hoi. 2023. Instructblip: Towards general-purpose vision-language models with instruction tuning. arxiv 2023. arXiv preprint arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_1_12_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Jinrui Yang, Xiawu Zheng, Ke Li, Xing Sun, Yunsheng Wu, and Rongrong Ji. 2023. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394 (2023)."},{"key":"e_1_3_2_1_14_1","unstructured":"Chaoyou Fu Haojia Lin Xiong Wang Yi-Fan Zhang Yunhang Shen Xiaoyu Liu Yangze Li Zuwei Long Heting Gao Ke Li et al. 2025. Vita-1.5: Towards gpt-4o level real-time vision and speech interaction. arXiv preprint arXiv:2501.01957 (2025)."},{"key":"e_1_3_2_1_15_1","volume-title":"Detecting and preventing hallucinations in large vision language models. arXiv preprint arXiv:2308.06394","author":"Gunjal Anisha","year":"2023","unstructured":"Anisha Gunjal, Jihan Yin, and Erhan Bas. 2023. Detecting and preventing hallucinations in large vision language models. arXiv preprint arXiv:2308.06394 (2023)."},{"key":"e_1_3_2_1_16_1","unstructured":"Chuan Guo Geoff Pleiss Yu Sun and Kilian Q Weinberger. 2017. On calibration of modern neural networks. In ICML."},{"key":"e_1_3_2_1_17_1","volume-title":"Gqa: A new dataset for real-world visual reasoning and compositional question answering. In CVPR.","author":"Hudson Drew A","year":"2019","unstructured":"Drew A Hudson and Christopher D Manning. 2019. Gqa: A new dataset for real-world visual reasoning and compositional question answering. In CVPR."},{"key":"e_1_3_2_1_18_1","volume-title":"Andrea Madotto, and Pascale Fung.","author":"Ji Ziwei","year":"2023","unstructured":"Ziwei Ji, Nayeon Lee, Rita Frieske, Tiezheng Yu, Dan Su, Yan Xu, Etsuko Ishii, Ye Jin Bang, Andrea Madotto, and Pascale Fung. 2023. Survey of hallucination in natural language generation. Comput. Surveys (2023)."},{"key":"e_1_3_2_1_19_1","unstructured":"Jae Myung Kim A Koepke Cordelia Schmid and Zeynep Akata. 2023. Exposing and Mitigating Spurious Correlations for Cross-Modal Retrieval. In CVPR."},{"key":"e_1_3_2_1_20_1","volume-title":"Mitigating object hallucinations in large vision-language models through visual contrastive decoding. arXiv preprint arXiv:2311.16922","author":"Leng Sicong","year":"2023","unstructured":"Sicong Leng, Hang Zhang, Guanzheng Chen, Xin Li, Shijian Lu, Chunyan Miao, and Lidong Bing. 2023. Mitigating object hallucinations in large vision-language models through visual contrastive decoding. arXiv preprint arXiv:2311.16922 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li Chunyuan","year":"2024","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2024. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Contrastive decoding: Open-ended text generation as optimization. arXiv preprint arXiv:2210.15097","author":"Li Xiang Lisa","year":"2022","unstructured":"Xiang Lisa Li, Ari Holtzman, Daniel Fried, Percy Liang, Jason Eisner, Tatsunori Hashimoto, Luke Zettlemoyer, and Mike Lewis. 2022. Contrastive decoding: Open-ended text generation as optimization. arXiv preprint arXiv:2210.15097 (2022)."},{"key":"e_1_3_2_1_23_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023. Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_24_1","unstructured":"Victor Weixin Liang Yuhui Zhang Yongchan Kwon Serena Yeung and James Y Zou. 2022. Mind the Gap: Understanding the Modality Gap in Multi-modal Contrastive Representation Learning. In NeurIPS S. Koyejo S. Mohamed A. Agarwal D. Belgrave K. Cho and A. Oh (Eds.)."},{"key":"e_1_3_2_1_25_1","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C Lawrence Zitnick. 2024. Microsoft coco: Common objects in context. In ECCV."},{"key":"e_1_3_2_1_26_1","volume-title":"Aligning Large Multi-Modal Model with Robust Instruction Tuning. arXiv preprint arXiv:2306.14565","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Kevin Lin, Linjie Li, Jianfeng Wang, Yaser Yacoob, and Lijuan Wang. 2023c. Aligning Large Multi-Modal Model with Robust Instruction Tuning. arXiv preprint arXiv:2306.14565 (2023)."},{"key":"e_1_3_2_1_27_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong Jae Lee. 2023a. Improved Baselines with Visual Instruction Tuning."},{"key":"e_1_3_2_1_28_1","volume-title":"Visual instruction tuning. arXiv preprint arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023b. Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"NVILA: Efficient frontier visual language models. arXiv preprint arXiv:2412.04468","author":"Liu Zhijian","year":"2024","unstructured":"Zhijian Liu, Ligeng Zhu, Baifeng Shi, Zhuoyang Zhang, Yuming Lou, Shang Yang, Haocheng Xi, Shiyi Cao, Yuxian Gu, Dacheng Li, et al., 2024. NVILA: Efficient frontier visual language models. arXiv preprint arXiv:2412.04468 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Mary Phuong and Christoph H Lampert. 2019. Distillation-based training for multi-exit architectures. In ICCV.","DOI":"10.1109\/ICCV.2019.00144"},{"key":"e_1_3_2_1_31_1","unstructured":"John Platt et al. 1999. Probabilistic outputs for support vector machines and comparisons to regularized likelihood methods. Advances in large margin classifiers (1999)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.lanwpc.2024.101048"},{"key":"e_1_3_2_1_33_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV]"},{"key":"e_1_3_2_1_34_1","volume-title":"Kaylee Burns, Trevor Darrell, and Kate Saenko.","author":"Rohrbach Anna","year":"2018","unstructured":"Anna Rohrbach, Lisa Anne Hendricks, Kaylee Burns, Trevor Darrell, and Kate Saenko. 2018. Object hallucination in image captioning. arXiv preprint arXiv:1809.02156 (2018)."},{"key":"e_1_3_2_1_35_1","volume-title":"Online deep learning: Learning deep neural networks on the fly. arXiv preprint arXiv:1711.03705","author":"Sahoo Doyen","year":"2017","unstructured":"Doyen Sahoo, Quang Pham, Jing Lu, and Steven CH Hoi. 2017. Online deep learning: Learning deep neural networks on the fly. arXiv preprint arXiv:1711.03705 (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"A-okvqa: A benchmark for visual question answering using world knowledge. In ECCV.","author":"Schwenk Dustin","year":"2022","unstructured":"Dustin Schwenk, Apoorv Khandelwal, Christopher Clark, Kenneth Marino, and Roozbeh Mottaghi. 2022. A-okvqa: A benchmark for visual question answering using world knowledge. In ECCV."},{"key":"e_1_3_2_1_37_1","volume-title":"Trusting Your Evidence: Hallucinate Less with Context-aware Decoding. arXiv preprint arXiv:2305.14739","author":"Shi Weijia","year":"2023","unstructured":"Weijia Shi, Xiaochuang Han, Mike Lewis, Yulia Tsvetkov, Luke Zettlemoyer, and Scott Wen-tau Yih. 2023. Trusting Your Evidence: Hallucinate Less with Context-aware Decoding. arXiv preprint arXiv:2305.14739 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Mavors: Multi-granularity video representation for multimodal large language model. arXiv preprint arXiv:2504.10068","author":"Shi Yang","year":"2025","unstructured":"Yang Shi, Jiaheng Liu, Yushuo Guan, Zhenhua Wu, Yuanxing Zhang, Zihao Wang, Weihong Lin, Jingyun Hua, Zekun Wang, Xinlong Chen, et al., 2025a. Mavors: Multi-granularity video representation for multimodal large language model. arXiv preprint arXiv:2504.10068 (2025)."},{"key":"e_1_3_2_1_39_1","unstructured":"Yang Shi Huanqian Wang Wulin Xie Huanyao Zhang Lijie Zhao Yi-Fan Zhang Xinfeng Li Chaoyou Fu Zhuoer Wen Wenting Liu et al. 2025b. MME-VideoOCR: Evaluating OCR-Based Capabilities of Multimodal LLMs in Video Scenarios. arXiv preprint arXiv:2505.21333 (2025)."},{"key":"e_1_3_2_1_40_1","unstructured":"Zhiqing Sun Sheng Shen Shengcao Cao Haotian Liu Chunyuan Li Yikang Shen Chuang Gan Liang-Yan Gui Yu-Xiong Wang Yiming Yang et al. 2023. Aligning large multimodal models with factually augmented rlhf. arXiv preprint arXiv:2309.14525 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313","author":"Tonmoy SM","year":"2024","unstructured":"SM Tonmoy, SM Zaman, Vinija Jain, Anku Rani, Vipula Rawte, Aman Chadha, and Amitava Das. 2024. A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539335"},{"key":"e_1_3_2_1_43_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_44_1","unstructured":"Jason Wei Yi Tay Rishi Bommasani Colin Raffel Barret Zoph Sebastian Borgeaud Dani Yogatama Maarten Bosma Denny Zhou Donald Metzler et al. 2022. Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)."},{"key":"e_1_3_2_1_45_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024. Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Speculative contrastive decoding. arXiv preprint arXiv:2311.08981","author":"Yuan Hongyi","year":"2023","unstructured":"Hongyi Yuan, Keming Lu, Fei Huang, Zheng Yuan, and Chang Zhou. 2023. Speculative contrastive decoding. arXiv preprint arXiv:2311.08981 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. arXiv preprint arXiv:2311.16502","author":"Yue Xiang","year":"2023","unstructured":"Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu Jiang, Weiming Ren, Yuxuan Sun, Cong Wei, Botao Yu, Ruibin Yuan, Renliang Sun, Ming Yin, Boyuan Zheng, Zhenzhu Yang, Yibo Liu, Wenhao Huang, Huan Sun, Yu Su, and Wenhu Chen. 2023. MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. arXiv preprint arXiv:2311.16502 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Mm-rlhf: The next step forward in multimodal llm alignment. arXiv preprint arXiv:2502.10391","author":"Zhang Yi-Fan","year":"2025","unstructured":"Yi-Fan Zhang, Tao Yu, Haochen Tian, Chaoyou Fu, Peiyan Li, Jianshu Zeng, Wulin Xie, Yang Shi, Huanyu Zhang, Junkang Wu, et al., 2025. Mm-rlhf: The next step forward in multimodal llm alignment. arXiv preprint arXiv:2502.10391 (2025)."},{"key":"e_1_3_2_1_49_1","unstructured":"Zihao Zhao Eric Wallace Shi Feng Dan Klein and Sameer Singh. 2021. Calibrate before use: Improving few-shot performance of language models. In ICML."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Wenxuan Zhou Sheng Zhang Hoifung Poon and Muhao Chen. 2023b. Context-faithful Prompting for Large Language Models. arXiv:2303.11315 [cs.CL]","DOI":"10.18653\/v1\/2023.findings-emnlp.968"},{"key":"e_1_3_2_1_51_1","volume-title":"Analyzing and mitigating object hallucination in large vision-language models. arXiv preprint arXiv:2310.00754","author":"Zhou Yiyang","year":"2023","unstructured":"Yiyang Zhou, Chenhang Cui, Jaehong Yoon, Linjun Zhang, Zhun Deng, Chelsea Finn, Mohit Bansal, and Huaxiu Yao. 2023a. Analyzing and mitigating object hallucination in large vision-language models. arXiv preprint arXiv:2310.00754 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755364","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T03:59:41Z","timestamp":1765339181000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755364"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":51,"alternative-id":["10.1145\/3746027.3755364","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755364","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}