{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:48:30Z","timestamp":1778082510075,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680576","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"525-534","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Hal-Eval: A Universal and Fine-grained Hallucination Evaluation Framework for Large Vision Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7282-159X","authenticated-orcid":false,"given":"Chaoya","family":"Jiang","sequence":"first","affiliation":[{"name":"National Engineering Research Center for Software Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4835-4920","authenticated-orcid":false,"given":"Hongrui","family":"Jia","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Software Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2526-9166","authenticated-orcid":false,"given":"Mengfan","family":"Dong","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Software Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9331-4716","authenticated-orcid":false,"given":"Wei","family":"Ye","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Software Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9442-5912","authenticated-orcid":false,"given":"Haiyang","family":"Xu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4959-8878","authenticated-orcid":false,"given":"Ming","family":"Yan","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3835-7975","authenticated-orcid":false,"given":"Ji","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8576-2674","authenticated-orcid":false,"given":"Shikun","family":"Zhang","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Software Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems, Vol. 35 (2022), 23716--23736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_2_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. ArXiv","author":"Google Rohan Anil Gemini Team","year":"2023","unstructured":"Gemini Team Google Rohan Anil, Sebastian Borgeaud, Yonghui Wu, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, and Johan Schalkwyk. 2023. Gemini: A Family of Highly Capable Multimodal Models. ArXiv, Vol. abs\/2312.11805 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, et al. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities. ArXiv","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities. ArXiv, Vol. abs\/2308.12966 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263875678"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"e_1_3_2_1_7_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. ArXiv","author":"Chen Ke","year":"2023","unstructured":"Ke Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. ArXiv, Vol. abs\/2306.15195 (2023). https:\/\/api.semanticscholar.org\/CorpusID:259262082"},{"key":"e_1_3_2_1_8_1","volume-title":"Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin.","author":"Chen Lin","year":"2023","unstructured":"Lin Chen, Jinsong Li, Xiao wen Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2023. ShareGPT4V: Improving Large Multi-Modal Models with Better Captions. ArXiv, Vol. abs\/2311.12793 (2023). https:\/\/api.semanticscholar.org\/CorpusID:265308687"},{"key":"e_1_3_2_1_9_1","article-title":"PaLM: Scaling Language Modeling with Pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2022","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, Parker Schuh, Kensen Shi, Sasha Tsvyashchenko, Joshua Maynez, Abhishek Rao, Parker Barnes, Yi Tay, Noam M. Shazeer, Vinodkumar Prabhakaran, Emily Reif, Nan Du, Benton C. Hutchinson, Reiner Pope, James Bradbury, Jacob Austin, Michael Isard, Guy Gur-Ari, Pengcheng Yin, Toju Duke, Anselm Levskaya, Sanjay Ghemawat, Sunipa Dev, Henryk Michalewski, Xavier Garc\u00eda, Vedant Misra, Kevin Robinson, Liam Fedus, Denny Zhou, Daphne Ippolito, David Luan, Hyeontaek Lim, Barret Zoph, Alexander Spiridonov, Ryan Sepassi, David Dohan, Shivani Agrawal, Mark Omernick, Andrew M. Dai, Thanumalayan Sankaranarayana Pillai, Marie Pellat, Aitor Lewkowycz, Erica Moreira, Rewon Child, Oleksandr Polozov, Katherine Lee, Zongwei Zhou, Xuezhi Wang, Brennan Saeta, Mark D\u00edaz, Orhan Firat, Michele Catasta, Jason Wei, Kathleen S. Meier-Hellstern, Douglas Eck, Jeff Dean, Slav Petrov, and Noah Fiedel. 2022. PaLM: Scaling Language Modeling with Pathways. J. Mach. Learn. Res., Vol. 24 (2022), 240:1--240:113. https:\/\/api.semanticscholar.org\/CorpusID:247951931","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_10_1","unstructured":"Wenliang Dai Junnan Li Dongxu Li et al. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv preprint arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"PaLM-E: An Embodied Multimodal Language Model. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:257364842","author":"Driess Danny","unstructured":"Danny Driess, F. Xia, Mehdi S. M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Ho Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Peter R. Florence. 2023. PaLM-E: An Embodied Multimodal Language Model. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:257364842"},{"key":"e_1_3_2_1_12_1","volume-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Zhenyu Qiu, Wei Lin, Jinrui Yang, Xiawu Zheng, et al. 2023. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Goyal Yash","year":"2017","unstructured":"Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, and Devi Parikh. 2017. Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Tianrui Guan Fuxiao Liu Xiyang Wu et al. 2023. HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination & Visual Illusion in Large Vision-Language Models. arXiv preprint arXiv:2310.14566 (2023).","DOI":"10.1109\/CVPR52733.2024.01363"},{"key":"e_1_3_2_1_15_1","volume-title":"Detecting and preventing hallucinations in large vision language models. arXiv preprint arXiv:2308.06394","author":"Gunjal Anisha","year":"2023","unstructured":"Anisha Gunjal, Jihan Yin, and Erhan Bas. 2023. Detecting and preventing hallucinations in large vision language models. arXiv preprint arXiv:2308.06394 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"CIEM: Contrastive Instruction Evaluation Method for Better Instruction Tuning. In NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following.","author":"Hu Hongyu","year":"2023","unstructured":"Hongyu Hu, Jiyuan Zhang, Minyi Zhao, et al. 2023. CIEM: Contrastive Instruction Evaluation Method for Better Instruction Tuning. In NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following."},{"key":"e_1_3_2_1_17_1","volume-title":"Qiang Liu, Kriti Aggarwal, Zewen Chi, Johan Bjorck, Vishrav Chaudhary, Subhojit Som, Xia Song, and Furu Wei.","author":"Huang Shaohan","year":"2023","unstructured":"Shaohan Huang, Li Dong, Wenhui Wang, Yaru Hao, Saksham Singhal, Shuming Ma, Tengchao Lv, Lei Cui, Owais Khan Mohammed, Qiang Liu, Kriti Aggarwal, Zewen Chi, Johan Bjorck, Vishrav Chaudhary, Subhojit Som, Xia Song, and Furu Wei. 2023. Language Is Not All You Need: Aligning Perception with Language Models. ArXiv, Vol. abs\/2302.14045 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257219775"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_1_19_1","volume-title":"FAITHSCORE: Evaluating Hallucinations in Large Vision-Language Models. arXiv preprint arXiv:2311.01477","author":"Jing Liqiang","year":"2023","unstructured":"Liqiang Jing, Ruosen Li, Yunmo Chen, et al. 2023. FAITHSCORE: Evaluating Hallucinations in Large Vision-Language Models. arXiv preprint arXiv:2311.01477 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"2023 d. Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125","author":"Li Bohao","year":"2023","unstructured":"Bohao Li, Rui Wang, Guangzhi Wang, Yuying Ge, Yixiao Ge, and Ying Shan. 2023 d. Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"2023 e. Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, et al. 2023 e. Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)."},{"key":"e_1_3_2_1_22_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese et al. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML."},{"key":"e_1_3_2_1_23_1","volume-title":"Hoi","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. ArXiv, Vol. abs\/2301.12597 (2023). https:\/\/api.semanticscholar.org\/CorpusID:256390509"},{"key":"e_1_3_2_1_24_1","unstructured":"Yifan Li Yifan Du Kun Zhou et al. 2023. Evaluating object hallucination in large vision-language models. In EMNLP."},{"key":"e_1_3_2_1_25_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Annual Meeting of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:964287","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Annual Meeting of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:964287"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740--755."},{"key":"e_1_3_2_1_27_1","volume-title":"2023 f. Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning. arXiv preprint arXiv:2306.14565","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Kevin Lin, Linjie Li, et al. 2023 f. Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning. arXiv preprint arXiv:2306.14565 (2023)."},{"key":"e_1_3_2_1_28_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li et al. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Improved Baselines with Visual Instruction Tuning. ArXiv","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved Baselines with Visual Instruction Tuning. ArXiv, Vol. abs\/2310.03744 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263672058"},{"key":"e_1_3_2_1_30_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu et al. 2023 e. Visual instruction tuning. In NeurIPS."},{"key":"e_1_3_2_1_31_1","volume-title":"2023 d. Visual Instruction Tuning. ArXiv","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023 d. Visual Instruction Tuning. ArXiv, Vol. abs\/2304.08485 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258179774"},{"key":"e_1_3_2_1_32_1","volume-title":"A Survey on Hallucination in Large Vision-Language Models. ArXiv","author":"Liu Hanchao","year":"2024","unstructured":"Hanchao Liu, Wenyuan Xue, Yifei Chen, Dapeng Chen, Xiutian Zhao, Ke Wang, Liping Hou, Rong-Zhi Li, and Wei Peng. 2024. A Survey on Hallucination in Large Vision-Language Models. ArXiv, Vol. abs\/2402.00253 (2024). https:\/\/api.semanticscholar.org\/CorpusID:267365472"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Yuan Liu Haodong Duan Yuanhan Zhang et al. 2023. MMBench: Is Your Multi-modal Model an All-around Player? arXiv preprint arXiv:2307.06281 (2023).","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Holy Lovenia Wenliang Dai Samuel Cahyawijaya et al. 2023. Negative Object Presence Evaluation (NOPE) to Measure Object Hallucination in Vision-Language Models. arXiv preprint arXiv:2310.05338 (2023).","DOI":"10.18653\/v1\/2024.alvr-1.4"},{"key":"e_1_3_2_1_36_1","unstructured":"OpenAI. 2023. GPT-4V(ision) System Card. https:\/\/cdn.openai.com\/papers\/GPTV_System_Card.pdf."},{"key":"e_1_3_2_1_37_1","volume-title":"Berg","author":"Ordonez Vicente","year":"2011","unstructured":"Vicente Ordonez, Girish Kulkarni, and Tamara L. Berg. 2011. Im2Text: Describing Images Using 1 Million Captioned Photographs. In Neural Information Processing Systems. https:\/\/api.semanticscholar.org\/CorpusID:14579301"},{"key":"e_1_3_2_1_38_1","volume-title":"Annual Meeting of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:11080756","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In Annual Meeting of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:11080756"},{"key":"e_1_3_2_1_39_1","volume-title":"ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. arxiv","author":"Rajbhandari Samyam","year":"1910","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. arxiv: 1910.02054 [cs.LG]"},{"key":"e_1_3_2_1_40_1","volume-title":"Kaylee Burns, et al.","author":"Rohrbach Anna","year":"2018","unstructured":"Anna Rohrbach, Lisa Anne Hendricks, Kaylee Burns, et al. 2018. Object Hallucination in Image Captioning. In EMNLP."},{"key":"e_1_3_2_1_41_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 25278--25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_42_1","volume-title":"NeurIPS","volume":"33","author":"Stiennon Nisan","year":"2020","unstructured":"Nisan Stiennon, Long Ouyang, Jeffrey Wu, et al. 2020. Learning to summarize with human feedback. In NeurIPS, Vol. 33."},{"key":"e_1_3_2_1_43_1","unstructured":"Zhiqing Sun Sheng Shen Shengcao Cao et al. 2023. Aligning large multimodal models with factually augmented rlhf. arXiv preprint arXiv:2309.14525 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Aligning Large Multimodal Models with Factually Augmented RLHF. ArXiv","author":"Sun Zhiqing","year":"2023","unstructured":"Zhiqing Sun, Sheng Shen, Shengcao Cao, Haotian Liu, Chunyuan Li, Yikang Shen, Chuang Gan, Liangyan Gui, Yu-Xiong Wang, Yiming Yang, Kurt Keutzer, and Trevor Darrell. 2023. Aligning Large Multimodal Models with Factually Augmented RLHF. ArXiv, Vol. abs\/2309.14525 (2023). https:\/\/api.semanticscholar.org\/CorpusID:262824780"},{"key":"e_1_3_2_1_45_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_46_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Weijia Li, Wei Li, Jiaqi Wang, and Conghui He.","author":"Wang Bin","year":"2023","unstructured":"Bin Wang, Fan Wu, Xiao Han, Jiahui Peng, Huaping Zhong, Pan Zhang, Xiao wen Dong, Weijia Li, Wei Li, Jiaqi Wang, and Conghui He. 2023. VIGC: Visual Instruction Generation and Correction. ArXiv, Vol. abs\/2308.12714 (2023). https:\/\/api.semanticscholar.org\/CorpusID:261100735"},{"key":"e_1_3_2_1_48_1","unstructured":"Junyang Wang Yuhang Wang Guohai Xu et al. 2023. An llm-free multi-dimensional benchmark for mllms hallucination evaluation. arXiv preprint arXiv:2311.07397 (2023)."},{"key":"e_1_3_2_1_49_1","unstructured":"Junyang Wang Yiyang Zhou Guohai Xu et al. 2023. Evaluation and analysis of hallucination in large vision-language models. arXiv preprint arXiv:2308.15126 (2023)."},{"key":"e_1_3_2_1_50_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_51_1","volume-title":"Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490","author":"Yu Weihao","year":"2023","unstructured":"Weihao Yu, Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Zicheng Liu, Xinchao Wang, and Lijuan Wang. 2023. Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)."},{"key":"e_1_3_2_1_52_1","unstructured":"Bohan Zhai Shijia Yang Xiangchen Zhao et al. 2023. HallE-Switch: Rethinking and Controlling Object Existence Hallucinations in Large Vision Language Models for Detailed Caption. arXiv preprint arXiv:2310.01779 (2023)."},{"key":"e_1_3_2_1_53_1","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen et al. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. ArXiv","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. ArXiv, Vol. abs\/2304.10592 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258291930"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680576","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680576","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680576"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":53,"alternative-id":["10.1145\/3664647.3680576","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680576","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}