{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:24Z","timestamp":1755825024764,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":22,"publisher":"ACM","funder":[{"name":"Amazon Research Award"},{"name":"National Science Foundation CAREER Grant","award":["IIS-2340435"],"award-info":[{"award-number":["IIS-2340435"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3734581","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"2138-2139","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Tutorial Proposal: Hallucinations in Large Language Models and Large Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9827-5835","authenticated-orcid":false,"given":"Liqiang","family":"Jing","sequence":"first","affiliation":[{"name":"University of Texas at Dallas, Richardson, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1441-3163","authenticated-orcid":false,"given":"Yue","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Texas at Dallas, Richardson, TX, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4255-8013","authenticated-orcid":false,"given":"Xinya","family":"Du","sequence":"additional","affiliation":[{"name":"University of Texas at Dallas, Richardson, TX, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"VQA: Visual Question Answering. In IEEE International Conference on Computer Vision. IEEE Computer Society, 2425--2433","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C. Lawrence Zitnick, and Devi Parikh. 2015. VQA: Visual Question Answering. In IEEE International Conference on Computer Vision. IEEE Computer Society, 2425--2433."},{"key":"e_1_3_2_1_2_1","unstructured":"Rishi Bommasani Drew A. Hudson Ehsan Adeli Russ B. Altman Simran Arora Sydney von Arx Michael S. Bernstein Jeannette Bohg Antoine Bosselut Emma Brunskill Erik Brynjolfsson Shyamal Buch Dallas Card Rodrigo Castellon Niladri S. Chatterji Annie S. Chen Kathleen Creel Jared Quincy Davis Dorottya Demszky Chris Donahue Moussa Doumbouya Esin Durmus Stefano Ermon John Etchemendy Kawin Ethayarajh Li Fei-Fei Chelsea Finn Trevor Gale Lauren Gillespie Karan Goel Noah D. Goodman Shelby Grossman Neel Guha Tatsunori Hashimoto Peter Henderson John Hewitt Daniel E. Ho Jenny Hong Kyle Hsu Jing Huang Thomas Icard Saahil Jain Dan Jurafsky Pratyusha Kalluri Siddharth Karamcheti Geoff Keeling Fereshte Khani Omar Khattab Pang Wei Koh Mark S. Krass Ranjay Krishna Rohith Kuditipudi and et al. 2021. On the Opportunities and Risks of Foundation Models. CoRR Vol. abs\/2108.07258 (2021). showeprint[arXiv]2108.07258 https:\/\/arxiv.org\/abs\/2108.07258"},{"key":"e_1_3_2_1_3_1","volume-title":"Language models are few-shot learners. arXiv preprint arXiv:2005.14165","author":"Brown Tom B","year":"2020","unstructured":"Tom B Brown. 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2409.16494"},{"key":"e_1_3_2_1_5_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR. OpenReview.net."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"e_1_3_2_1_7_1","volume-title":"FGAIF: Aligning Large Vision-Language Models with Fine-grained AI Feedback. arXiv preprint arXiv:2404.05046","author":"Jing Liqiang","year":"2024","unstructured":"Liqiang Jing and Xinya Du. 2024. FGAIF: Aligning Large Vision-Language Models with Fine-grained AI Feedback. arXiv preprint arXiv:2404.05046 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2409.07703"},{"key":"e_1_3_2_1_9_1","volume-title":"FaithScore: Fine-grained Evaluations of Hallucinations in Large Vision-Language Models. In Findings of the Association for Computational Linguistics EMNLP","author":"Jing Liqiang","year":"2024","unstructured":"Liqiang Jing, Ruosen Li, Yunmo Chen, and Xinya Du. 2024b. FaithScore: Fine-grained Evaluations of Hallucinations in Large Vision-Language Models. In Findings of the Association for Computational Linguistics EMNLP 2024. https:\/\/arxiv.org\/abs\/2311.01477"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2023.ACL-LONG.635"},{"key":"e_1_3_2_1_11_1","volume-title":"Wayne Xin Zhao, and Ji rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji rong Wen. 2023. Evaluating Object Hallucination in Large Vision-Language Models. ArXiv, Vol. abs\/2305.10355 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"Visual Instruction Tuning. CoRR","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. CoRR, Vol. abs\/2304.08485 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Meredith Ringel Morris, and Edward Cutrell","author":"MacLeod Haley","year":"2017","unstructured":"Haley MacLeod, Cynthia L. Bennett, Meredith Ringel Morris, and Edward Cutrell. 2017. Understanding Blind People's Experiences with Computer-Generated Captions of Social Media Images. In CHI. ACM, 5988--5999."},{"key":"e_1_3_2_1_14_1","unstructured":"OpenAI. 2022. Introducing chatgpt. (2022)."},{"key":"e_1_3_2_1_15_1","volume-title":"Kaylee Burns, Trevor Darrell, and Kate Saenko.","author":"Rohrbach Anna","year":"2018","unstructured":"Anna Rohrbach, Lisa Anne Hendricks, Kaylee Burns, Trevor Darrell, and Kate Saenko. 2018. Object Hallucination in Image Captioning. In EMNLP. ACL, 4035--4045."},{"key":"e_1_3_2_1_16_1","volume-title":"Aligning Large Multimodal Models with Factually Augmented RLHF. CoRR","author":"Sun Zhiqing","year":"2023","unstructured":"Zhiqing Sun, Sheng Shen, Shengcao Cao, Haotian Liu, Yikang Shen, Chuang Gan, Liang-Yan Gui, Yu-Xiong Wang, Yiming Yang, Kurt Keutzer, and Trevor Darrell. 2023. Aligning Large Multimodal Models with Factually Augmented RLHF. CoRR, Vol. abs\/2309.14525 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. CoRR","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aur\u00e9lien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. CoRR, Vol. abs\/2302.13971 (2023)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2409.13612"},{"key":"e_1_3_2_1_19_1","volume-title":"mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. CoRR","author":"Ye Qinghao","year":"2023","unstructured":"Qinghao Ye, Haiyang Xu, Guohai Xu, Jiabo Ye, Ming Yan, Yiyang Zhou, Junyang Wang, Anwen Hu, Pengcheng Shi, Yaya Shi, Chenliang Li, Yuanhong Xu, Hehong Chen, Junfeng Tian, Qian Qi, Ji Zhang, and Fei Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. CoRR, Vol. abs\/2304.14178 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Woodpecker: Hallucination Correction for Multimodal Large Language Models. CoRR","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Tong Xu, Hao Wang, Dianbo Sui, Yunhang Shen, Ke Li, Xing Sun, and Enhong Chen. 2023. Woodpecker: Hallucination Correction for Multimodal Large Language Models. CoRR, Vol. abs\/2310.16045 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V39I24.34792"},{"key":"e_1_3_2_1_22_1","unstructured":"Yue Zhang Jingxuan Zuo and Liqiang Jing. 2024. Fine-grained and Explainable Factuality Evaluation for Multimodal Summarization. arxiv: 2402.11414 [cs.CL] https:\/\/arxiv.org\/abs\/2402.11414"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3734581","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:14:10Z","timestamp":1755749650000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3734581"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":22,"alternative-id":["10.1145\/3731715.3734581","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3734581","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}