{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:07Z","timestamp":1765343047470,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754784","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"6791-6800","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Identify, Isolate, and Purge: Mitigating Hallucinations in LVLMs via Self-Evolving Distillation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-4010-3536","authenticated-orcid":false,"given":"Wenhao","family":"Li","sequence":"first","affiliation":[{"name":"The University of Sydney, Sydney, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9863-5404","authenticated-orcid":false,"given":"Xiu","family":"Su","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8283-0646","authenticated-orcid":false,"given":"Jingyi","family":"Wu","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3495-5257","authenticated-orcid":false,"given":"Feng","family":"Yang","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1312-0146","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4283-7485","authenticated-orcid":false,"given":"Yi","family":"Chen","sequence":"additional","affiliation":[{"name":"HKUST, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1964-0430","authenticated-orcid":false,"given":"Shan","family":"You","sequence":"additional","affiliation":[{"name":"Sensetime Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4756-0609","authenticated-orcid":false,"given":"Chang","family":"Xu","sequence":"additional","affiliation":[{"name":"The University of Sydney, Sydney, Australia"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems 35 (2022) 23716--23736."},{"key":"e_1_3_2_1_3_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966 1, 2","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966 1, 2 (2023), 3."},{"key":"e_1_3_2_1_5_1","volume-title":"Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930","author":"Bai Zechen","year":"2024","unstructured":"Zechen Bai, Pichao Wang, Tianjun Xiao, Tong He, Zongbo Han, Zheng Zhang, and Mike Zheng Shou. 2024. Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Cycles of thought: Measuring llm confidence through stable explanations. arXiv preprint arXiv:2406.03441","author":"Becker Evan","year":"2024","unstructured":"Evan Becker and Stefano Soatto. 2024. Cycles of thought: Measuring llm confidence through stable explanations. arXiv preprint arXiv:2406.03441 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00253"},{"key":"e_1_3_2_1_8_1","volume-title":"Language models are few-shot learners. arXiv preprint arXiv:2005.14165","author":"Brown Tom B","year":"2020","unstructured":"Tom B Brown. 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_9_1","volume-title":"Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794","author":"Chen Xi","year":"2022","unstructured":"Xi Chen, Xiao Wang, Soravit Changpinyo, AJ Piergiovanni, Piotr Padlewski, Daniel Salz, Sebastian Goodman, Adam Grycner, Basil Mustafa, Lucas Beyer, et al. 2022. Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794 (2022)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Zhe Chen Weiyun Wang Hao Tian Shenglong Ye Zhangwei Gao Erfei Cui Wenwen Tong Kongzhi Hu Jiapeng Luo Zheng Ma et al. 2024. How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites. arXiv preprint arXiv:2404.16821 (2024).","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185--24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al. 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185--24198."},{"key":"e_1_3_2_1_12_1","volume-title":"Halc: Object hallucination reduction via adaptive focal-contrast decoding. arXiv preprint arXiv:2403.00425","author":"Chen Zhaorun","year":"2024","unstructured":"Zhaorun Chen, Zhuokai Zhao, Hongyin Luo, Huaxiu Yao, Bo Li, and Jiawei Zhou. 2024. Halc: Object hallucination reduction via adaptive focal-contrast decoding. arXiv preprint arXiv:2403.00425 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys.org (accessed","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al. 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys.org (accessed 14 April 2023) 2, 3 (2023), 6."},{"key":"e_1_3_2_1_14_1","volume-title":"Seeing is believing: Mitigating hallucination in large vision-language models via clip-guided decoding. arXiv preprint arXiv:2402.15300","author":"Deng Ailin","year":"2024","unstructured":"Ailin Deng, Zhirui Chen, and Bryan Hooi. 2024. Seeing is believing: Mitigating hallucination in large vision-language models via clip-guided decoding. arXiv preprint arXiv:2402.15300 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_16_1","volume-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv:2306.13394 [cs.CV] https:\/\/arxiv.org\/abs\/2306.13394","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Jinrui Yang, Xiawu Zheng, Ke Li, Xing Sun, Yunsheng Wu, and Rongrong Ji. 2024. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv:2306.13394 [cs.CV] https:\/\/arxiv.org\/abs\/2306.13394"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.2305016120"},{"key":"e_1_3_2_1_18_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Gu Yuxian","year":"2024","unstructured":"Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. 2024. MiniLLM: Knowledge distillation of large language models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 17980--17989","author":"Hu Xiaowei","year":"2022","unstructured":"Xiaowei Hu, Zhe Gan, Jianfeng Wang, Zhengyuan Yang, Zicheng Liu, Yumao Lu, and Lijuan Wang. 2022. Scaling up vision-language pre-training for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 17980--17989."},{"key":"e_1_3_2_1_21_1","unstructured":"Lei Huang Weijiang Yu Weitao Ma Weihong Zhong Zhangyin Feng Haotian Wang Qianglong Chen Weihua Peng Xiaocheng Feng Bing Qin et al. 2023. A survey on hallucination in large language models: Principles taxonomy challenges and open questions. arXiv preprint arXiv:2311.05232 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01274"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02553"},{"key":"e_1_3_2_1_24_1","volume-title":"BERT: a review of applications in natural language processing and understanding. arXiv preprint arXiv:2103.11943","author":"Koroteev Mikhail V","year":"2021","unstructured":"Mikhail V Koroteev. 2021. BERT: a review of applications in natural language processing and understanding. arXiv preprint arXiv:2103.11943 (2021)."},{"key":"e_1_3_2_1_25_1","first-page":"34586","article-title":"Factuality enhanced language models for open-ended text generation","volume":"35","author":"Lee Nayeon","year":"2022","unstructured":"Nayeon Lee, Wei Ping, Peng Xu, Mostofa Patwary, Pascale N Fung, Mohammad Shoeybi, and Bryan Catanzaro. 2022. Factuality enhanced language models for open-ended text generation. Advances in Neural Information Processing Systems 35 (2022), 34586--34599.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_29_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_30_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023. Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023. Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740--755."},{"key":"e_1_3_2_1_33_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Kevin Lin, Linjie Li, Jianfeng Wang, Yaser Yacoob, and Lijuan Wang. 2023. Mitigating hallucination in large multi-modal models via robust instruction tuning. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_35_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems 36","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 364","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 364 (2019)."},{"key":"e_1_3_2_1_37_1","volume-title":"Look","author":"Qu Xiaoye","year":"2024","unstructured":"Xiaoye Qu, Jiashuo Sun, Wei Wei, and Yu Cheng. 2024. Look, Compare, Decide: Alleviating Hallucination in Large Vision-Language Models via Multi-View Multi-Path Reasoning. arXiv preprint arXiv:2408.17150 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Kaylee Burns, Trevor Darrell, and Kate Saenko.","author":"Rohrbach Anna","year":"2018","unstructured":"Anna Rohrbach, Lisa Anne Hendricks, Kaylee Burns, Trevor Darrell, and Kate Saenko. 2018. Object hallucination in image captioning. arXiv preprint arXiv:1809.02156 (2018)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_40_1","unstructured":"Zhiqing Sun Sheng Shen Shengcao Cao Haotian Liu Chunyuan Li Yikang Shen Chuang Gan Liang-Yan Gui Yu-Xiong Wang Yiming Yang et al. 2023. Aligning large multimodal models with factually augmented rlhf. arXiv preprint arXiv:2309.14525 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313","author":"Tonmoy SM","year":"2024","unstructured":"SM Tonmoy, SM Zaman, Vinija Jain, Anku Rani, Vipula Rawte, Aman Chadha, and Amitava Das. 2024. A comprehensive survey of hallucination mitigation techniques in large language models. arXiv preprint arXiv:2401.01313 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-53302-0_3"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-53302-0_3"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72980-5_17"},{"key":"e_1_3_2_1_47_1","volume-title":"Logical closed loop: Uncovering object hallucinations in large vision-language models. arXiv preprint arXiv:2402.11622","author":"Wu Junfei","year":"2024","unstructured":"Junfei Wu, Qiang Liu, Ding Wang, Jinghao Zhang, Shu Wu, Liang Wang, and Tieniu Tan. 2024. Logical closed loop: Uncovering object hallucinations in large vision-language models. arXiv preprint arXiv:2402.11622 (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"Detecting and mitigating hallucination in large vision language models via fine-grained ai feedback. arXiv preprint arXiv:2404.14233","author":"Xiao Wenyi","year":"2024","unstructured":"Wenyi Xiao, Ziwei Huang, Leilei Gan, Wanggui He, Haoyuan Li, Zhelun Yu, Hao Jiang, Fei Wu, and Linchao Zhu. 2024. Detecting and mitigating hallucination in large vision language models via fine-grained ai feedback. arXiv preprint arXiv:2404.14233 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"International Conference on Machine Learning. PMLR, 38728--38748","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Ming Yan, Yaya Shi, Jiabo Ye, Yuanhong Xu, Chenliang Li, Bin Bi, Qi Qian, Wei Wang, et al. 2023. mplug-2: A modularized multi-modal foundation model across text, image and video. In International Conference on Machine Learning. PMLR, 38728--38748."},{"key":"e_1_3_2_1_50_1","volume-title":"Cognitive mirage: A review of hallucinations in large language models. arXiv preprint arXiv:2309.06794","author":"Ye Hongbin","year":"2023","unstructured":"Hongbin Ye, Tong Liu, Aijia Zhang, Wei Hua, and Weiqiang Jia. 2023. Cognitive mirage: A review of hallucinations in large language models. arXiv preprint arXiv:2309.06794 (2023)."},{"key":"e_1_3_2_1_51_1","volume-title":"A survey on multimodal large language models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Woodpecker: Hallucination correction for multimodal large language models. arXiv preprint arXiv:2310.16045","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Tong Xu, Hao Wang, Dianbo Sui, Yunhang Shen, Ke Li, Xing Sun, and Enhong Chen. 2023. Woodpecker: Hallucination correction for multimodal large language models. arXiv preprint arXiv:2310.16045 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Ferret: Refer and ground anything anywhere at any granularity. arXiv preprint arXiv:2310.07704","author":"You Haoxuan","year":"2023","unstructured":"Haoxuan You, Haotian Zhang, Zhe Gan, Xianzhi Du, Bowen Zhang, Zirui Wang, Liangliang Cao, Shih-Fu Chang, and Yinfei Yang. 2023. Ferret: Refer and ground anything anywhere at any granularity. arXiv preprint arXiv:2310.07704 (2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01230"},{"key":"e_1_3_2_1_55_1","volume-title":"Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490","author":"Yu Weihao","year":"2023","unstructured":"Weihao Yu, Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Zicheng Liu, Xinchao Wang, and Lijuan Wang. 2023. Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Guangtao Zeng, Tianduo Wang, and Wei Lu. 2024. Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385 (2024)."},{"key":"e_1_3_2_1_57_1","volume-title":"Analyzing and mitigating object hallucination in large vision-language models. arXiv preprint arXiv:2310.00754","author":"Zhou Yiyang","year":"2023","unstructured":"Yiyang Zhou, Chenhang Cui, Jaehong Yoon, Linjun Zhang, Zhun Deng, Chelsea Finn, Mohit Bansal, and Huaxiu Yao. 2023. Analyzing and mitigating object hallucination in large vision-language models. arXiv preprint arXiv:2310.00754 (2023)."},{"key":"e_1_3_2_1_58_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"Ibd: Alleviating hallucinations in large vision-language models via image-biased decoding. arXiv preprint arXiv:2402.18476","author":"Zhu Lanyun","year":"2024","unstructured":"Lanyun Zhu, Deyi Ji, Tianrun Chen, Peng Xu, Jieping Ye, and Jun Liu. 2024. Ibd: Alleviating hallucinations in large vision-language models via image-biased decoding. arXiv preprint arXiv:2402.18476 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754784","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:44Z","timestamp":1765342784000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754784"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":59,"alternative-id":["10.1145\/3746027.3754784","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754784","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}