{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T21:19:00Z","timestamp":1771708740349,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758308","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"13442-13449","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SHALE: A Scalable Benchmark for Fine-grained Hallucination Evaluation in LVLMs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-7948-7033","authenticated-orcid":false,"given":"Bei","family":"Yan","sequence":"first","affiliation":[{"name":"Key Laboratory of AI Safety of CAS, Institute of Computing Technology, Chinese Academy of Sciences (CAS), Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6328-2744","authenticated-orcid":false,"given":"Zhiyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Key Laboratory of AI Safety of CAS, Institute of Computing Technology, Chinese Academy of Sciences (CAS), Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0696-2468","authenticated-orcid":false,"given":"Yuecong","family":"Min","sequence":"additional","affiliation":[{"name":"Key Laboratory of AI Safety of CAS, Institute of Computing Technology, Chinese Academy of Sciences (CAS), Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8899-3996","authenticated-orcid":false,"given":"Jie","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of AI Safety of CAS, Institute of Computing Technology, Chinese Academy of Sciences (CAS), Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6657-6403","authenticated-orcid":false,"given":"Jiahao","family":"Wang","sequence":"additional","affiliation":[{"name":"Trustworthy Technology and Engineering Laboratory, Huawei, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1960-6207","authenticated-orcid":false,"given":"Xiaozhen","family":"Wang","sequence":"additional","affiliation":[{"name":"Trustworthy Technology and Engineering Laboratory, Huawei, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8348-392X","authenticated-orcid":false,"given":"Shiguang","family":"Shan","sequence":"additional","affiliation":[{"name":"Key Laboratory of AI Safety of CAS, Institute of Computing Technology, Chinese Academy of Sciences (CAS), Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, et al., 2024. Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Alibaba DAMO Academy. 2024. Qwen-VL-Max. 
https:\/\/huggingface.co\/spaces\/Qwen\/Qwen-VL-Max."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-76298-0_52"},{"key":"e_1_3_2_1_4_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Hallucination of Multimodal Large Language Models: A Survey. arXiv preprint arXiv:2404.18930","author":"Bai Zechen","year":"2024","unstructured":"Zechen Bai, Pichao Wang, Tianjun Xiao, Tong He, Zongbo Han, Zheng Zhang, and Mike Zheng Shou. 2024. Hallucination of Multimodal Large Language Models: A Survey. arXiv preprint arXiv:2404.18930 (2024)."},{"key":"e_1_3_2_1_6_1","first-page":"22680","article-title":"Mitigating Open-Vocabulary Caption Hallucinations","author":"Ben-Kish Assaf","year":"2024","unstructured":"Assaf Ben-Kish, Moran Yanuka, Morris Alper, Raja Giryes, and Hadar Averbuch-Elor. 2024. Mitigating Open-Vocabulary Caption Hallucinations. In EMNLP. 22680-22698.","journal-title":"EMNLP."},{"key":"e_1_3_2_1_7_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_8_1","unstructured":"Lin Chen Jinsong Li Xiaoyi Dong Pan Zhang Yuhang Zang Zehui Chen Haodong Duan Jiaqi Wang Yu Qiao Dahua Lin et al. [n.d.]. Are We on the Right Way for Evaluating Large Vision-Language Models?. In NeurIPS."},{"key":"e_1_3_2_1_9_1","first-page":"3235","article-title":"Unified Hallucination Detection for Multimodal Large Language Models","author":"Chen Xiang","year":"2024","unstructured":"Xiang Chen, Chenxi Wang, Yida Xue, Ningyu Zhang, Xiaoyan Yang, Qiang Li, Yue Shen, Lei Liang, Jinjie Gu, and Huajun Chen. 2024a. Unified Hallucination Detection for Multimodal Large Language Models. In ACL. 3235-3252.","journal-title":"ACL."},{"key":"e_1_3_2_1_10_1","first-page":"24185","article-title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024b. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. In CVPR. 24185-24198.","journal-title":"CVPR."},{"key":"e_1_3_2_1_11_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2024. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. NeurIPS, Vol. 36 (2024)."},{"key":"e_1_3_2_1_12_1","unstructured":"Google DeepMind. 2024. Gemini-2.0. 
https:\/\/blog.google\/technology\/google-deepmind\/google-gemini-ai-update-december-2024\/."},{"key":"e_1_3_2_1_13_1","first-page":"248","article-title":"ImageNet: A Large-Scale Hierarchical Image Database","author":"Deng Jia","year":"2009","unstructured":"Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. In CVPR. Ieee, 248-255.","journal-title":"CVPR. Ieee"},{"key":"e_1_3_2_1_14_1","first-page":"10707","article-title":"Hallu-PI","author":"Ding Peng","year":"2024","unstructured":"Peng Ding, Jingyu Wu, Jun Kuang, Dan Ma, Xuezhi Cao, Xunliang Cai, Shi Chen, Jiajun Chen, and Shujian Huang. 2024. Hallu-PI: Evaluating Hallucination in Multi-modal Large Language Models within Perturbed Inputs. In ACM MM. 10707-10715.","journal-title":"In ACM MM."},{"key":"e_1_3_2_1_15_1","unstructured":"Xiaoyi Dong Pan Zhang Yuhang Zang Yuhang Cao Bin Wang Linke Ouyang Xilin Wei Songyang Zhang Haodong Duan Maosong Cao et al. 2024. Internlm-xcomposer2: Mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420 (2024)."},{"key":"e_1_3_2_1_16_1","unstructured":"Team GLM Aohan Zeng Bin Xu Bowen Wang Chenhui Zhang Da Yin Dan Zhang Diego Rojas Guanyu Feng Hanlin Zhao et al. 2024. ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools. arXiv preprint arXiv:2406.12793 (2024)."},{"key":"e_1_3_2_1_17_1","first-page":"14375","author":"Guan Tianrui","year":"2024","unstructured":"Tianrui Guan, Fuxiao Liu, Xiyang Wu, Ruiqi Xian, Zongxia Li, Xiaoyu Liu, Xijun Wang, Lichang Chen, Furong Huang, Yaser Yacoob, et al., 2024. HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models. In CVPR. 14375-14385.","journal-title":"In CVPR."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3703155"},{"key":"e_1_3_2_1_19_1","unstructured":"iFLYTEK Co. Ltd. 2024. iFLYTEK OCR. https:\/\/www.xfyun.cn\/services\/common-ocr."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"e_1_3_2_1_21_1","volume-title":"Large Language Models in Law: A Survey. AI Open","author":"Lai Jinqi","year":"2024","unstructured":"Jinqi Lai, Wensheng Gan, Jiayang Wu, Zhenlian Qi, and S Yu Philip. 2024. Large Language Models in Law: A Survey. AI Open (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension. arXiv preprint arXiv:2307.16125","author":"Li Bohao","year":"2023","unstructured":"Bohao Li, Rui Wang, Guangzhi Wang, Yuying Ge, Yixiao Ge, and Ying Shan. 2023c. SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension. arXiv preprint arXiv:2307.16125 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Jingkang Yang, Chunyuan Li, and Ziwei Liu.","author":"Li Bo","year":"2025","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Fanyi Pu, Joshua Adrian Cahyono, Jingkang Yang, Chunyuan Li, and Ziwei Liu. 2025. Otter: a Multi-Modal Model with in-Context Instruction Tuning. IEEE TPAMI (2025)."},{"key":"e_1_3_2_1_24_1","first-page":"292","article-title":"Evaluating Object Hallucination in Large Vision-Language Models","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023a. Evaluating Object Hallucination in Large Vision-Language Models. In EMNLP. 
292-305.","journal-title":"EMNLP."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3604237.3626869"},{"key":"e_1_3_2_1_26_1","volume-title":"Microsoft COCO: Common Objects in Context","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV. Springer, 740-755."},{"key":"e_1_3_2_1_27_1","volume-title":"Evaluating Text-to-Visual Generation with Image-to-Text Generation","author":"Lin Zhiqiu","unstructured":"Zhiqiu Lin, Deepak Pathak, Baiqi Li, Jiayao Li, Xide Xia, Graham Neubig, Pengchuan Zhang, and Deva Ramanan. 2024. Evaluating Text-to-Visual Generation with Image-to-Text Generation. In ECCV. Springer, 366-384."},{"key":"e_1_3_2_1_28_1","unstructured":"Fuxiao Liu Kevin Lin Linjie Li Jianfeng Wang Yaser Yacoob and Lijuan Wang. 2024. Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning. In ICLR."},{"key":"e_1_3_2_1_29_1","first-page":"34892","article-title":"Visual instruction tuning","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. NeurIPS, Vol. 36 (2023), 34892-34916.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_30_1","unstructured":"Jiazhen Liu Yuhan Fu Ruobing Xie Runquan Xie Xingwu Sun Fengzong Lian Zhanhui Kang and Xirong Li. 2025. PhD: A ChatGPT-Prompted Visual hallucination Evaluation Dataset. In CVPR."},{"key":"e_1_3_2_1_31_1","first-page":"37","article-title":"Negative Object Presence Evaluation (NOPE) to Measure Object Hallucination in Vision-Language Models","author":"Lovenia Holy","year":"2024","unstructured":"Holy Lovenia, Wenliang Dai, Samuel Cahyawijaya, Ziwei Ji, and Pascale Fung. 2024. Negative Object Presence Evaluation (NOPE) to Measure Object Hallucination in Vision-Language Models. In ALVR. 37-58.","journal-title":"ALVR."},{"key":"e_1_3_2_1_32_1","unstructured":"Aleksander Madry Aleksandar Makelov Ludwig Schmidt Dimitris Tsipras and Adrian Vladu. 2018. Towards Deep Learning Models Resistant to Adversarial Attacks. In ICLR."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Cheng Peng Xi Yang Aokun Chen Kaleb E Smith Nima PourNejatian Anthony B Costa Cheryl Martin Mona G Flores Ying Zhang Tanja Magoc et al. 2023. A Study of Generative Large Language Model for Medical Research and Healthcare. NPJ digital medicine Vol. 6 1 (2023) 210.","DOI":"10.1038\/s41746-023-00958-w"},{"key":"e_1_3_2_1_34_1","volume-title":"A Survey of Hallucination in Large Foundation Models. arXiv preprint arXiv:2309.05922","author":"Rawte Vipula","year":"2023","unstructured":"Vipula Rawte, Amit Sheth, and Amitava Das. 2023. A Survey of Hallucination in Large Foundation Models. arXiv preprint arXiv:2309.05922 (2023)."},{"key":"e_1_3_2_1_35_1","first-page":"4035","article-title":"Object Hallucination in Image Captioning","author":"Rohrbach Anna","year":"2018","unstructured":"Anna Rohrbach, Lisa Anne Hendricks, Kaylee Burns, Trevor Darrell, and Kate Saenko. 2018. Object Hallucination in Image Captioning. In EMNLP. 4035-4045.","journal-title":"EMNLP."},{"key":"e_1_3_2_1_36_1","first-page":"10684","article-title":"High-Resolution Image Synthesis with Latent Diffusion Models","author":"Rombach Robin","year":"2022","unstructured":"Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2022. 
High-Resolution Image Synthesis with Latent Diffusion Models. In CVPR. 10684-10695.","journal-title":"CVPR."},{"key":"e_1_3_2_1_37_1","first-page":"49659","volume":"36","author":"Sun Keqiang","year":"2023","unstructured":"Keqiang Sun, Junting Pan, Yuying Ge, Hao Li, Haodong Duan, Xiaoshi Wu, Renrui Zhang, Aojun Zhou, Zipeng Qin, Yi Wang, et al., 2023b. JourneyDB: A Benchmark for Generative Image Understanding. NeurIPS, Vol. 36 (2023), 49659-49678.","journal-title":"JourneyDB: A Benchmark for Generative Image Understanding. NeurIPS"},{"key":"e_1_3_2_1_38_1","volume-title":"EVA-CLIP: Improved Training Techniques for CLIP at Scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023a. EVA-CLIP: Improved Training Techniques for CLIP at Scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Yu Xiong Wang, Yiming Yang, et al.","author":"Sun Zhiqing","year":"2024","unstructured":"Zhiqing Sun, Sheng Shen, Shengcao Cao, Haotian Liu, Chunyuan Li, Yikang Shen, Chuang Gan, Liang Yan Gui, Yu Xiong Wang, Yiming Yang, et al., 2024. Aligning Large Multimodal Models with Factually Augmented RLHF. In Findings ACL. Association for Computational Linguistics (ACL), 13088-13110."},{"key":"e_1_3_2_1_40_1","volume-title":"An LLM-free Multi-dimensional Benchmark for MLLMs Hallucination Evaluation. arXiv preprint arXiv:2311.07397","author":"Wang Junyang","year":"2023","unstructured":"Junyang Wang, Yuhang Wang, Guohai Xu, Jing Zhang, Yukai Gu, Haitao Jia, Ming Yan, Ji Zhang, and Jitao Sang. 2023. An LLM-free Multi-dimensional Benchmark for MLLMs Hallucination Evaluation. arXiv preprint arXiv:2311.07397 (2023)."},{"key":"e_1_3_2_1_41_1","unstructured":"Zhiyu Wu Xiaokang Chen Zizheng Pan Xingchao Liu Wen Liu Damai Dai Huazuo Gao Yiyang Ma Chengyue Wu Bingxuan Wang et al. 2024. DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding. arXiv preprint arXiv:2412.10302 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"MMBench: Benchmarking End-to-End Multi-modal DNNs and Understanding Their Hardware-Software Implications. In IEEE International Symposium on Workload Characterization (IISWC).","author":"Xu Cheng","year":"2023","unstructured":"Cheng Xu, Xiaofeng Hou, Jiacheng Liu, Chao Li, Tianhao Huang, Xiaozhi Zhu, Mo Niu, Lingyu Sun, Peng Tang, Tongqiao Xu, Kwang-Ting Cheng, and Minyi Guo. 2023. MMBench: Benchmarking End-to-End Multi-modal DNNs and Understanding Their Hardware-Software Implications. In IEEE International Symposium on Workload Characterization (IISWC)."},{"key":"e_1_3_2_1_43_1","volume-title":"LLVLM-eHub: A Comprehensive Evaluation Benchmark for Large Vision-Language Models","author":"Xu Peng","year":"2024","unstructured":"Peng Xu, Wenqi Shao, Kaipeng Zhang, Peng Gao, Shuo Liu, Meng Lei, Fanqing Meng, Siyuan Huang, Yu Qiao, and Ping Luo. 2024. LLVLM-eHub: A Comprehensive Evaluation Benchmark for Large Vision-Language Models. IEEE TPAMI (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"A MultiModal Moral Benchmark for LVLMs. arXiv preprint arXiv:2412.20718","author":"Yan Bei","year":"2024","unstructured":"Bei Yan, Jie Zhang, Zhiyuan Chen, Shiguang Shan, and Xilin Chen. 2024. M^3oralBench: A MultiModal Moral Benchmark for LVLMs. arXiv preprint arXiv:2412.20718 (2024)."},{"key":"e_1_3_2_1_45_1","unstructured":"Yuan Yao Tianyu Yu Ao Zhang Chongyi Wang Junbo Cui Hongji Zhu Tianchi Cai Haoyu Li Weilin Zhao Zhihui He et al. 2024. 
MiniCPM-V: A GPT-4V Level MLLM on Your Phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_1_46_1","first-page":"13040","article-title":"mPLUG-Owl2","author":"Ye Qinghao","year":"2024","unstructured":"Qinghao Ye, Haiyang Xu, Jiabo Ye, Ming Yan, Anwen Hu, Haowei Liu, Qi Qian, Ji Zhang, and Fei Huang. 2024. mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration. In CVPR. 13040-13051.","journal-title":"Revolutionizing Multi-modal Large Language Model with Modality Collaboration. In CVPR."},{"key":"e_1_3_2_1_47_1","volume-title":"Dysca: A Dynamic and Scalable Benchmark for Evaluating Perception Ability of LVLMs. ICLR","author":"Zhang Jie","year":"2024","unstructured":"Jie Zhang, Zhongqi Wang, Mengqi Lei, Zheng Yuan, Bei Yan, Shiguang Shan, and Xilin Chen. 2024. Dysca: A Dynamic and Scalable Benchmark for Evaluating Perception Ability of LVLMs. ICLR (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Shuangrui Ding, Songyang Zhang, Haodong Duan, Hang Yan, et al.","author":"Zhang Pan","year":"2023","unstructured":"Pan Zhang, Xiaoyi Dong Bin Wang, Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Shuangrui Ding, Songyang Zhang, Haodong Duan, Hang Yan, et al., 2023. InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition. arXiv preprint arXiv:2309.15112 (2023)."},{"key":"e_1_3_2_1_49_1","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Yuchen Duan Hao Tian Weijie Su Jie Shao et al. 2025. Internvl3: Exploring advanced training and test-time recipes for open-source multimodal models. arXiv preprint arXiv:2504.10479 (2025)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758308","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:46Z","timestamp":1765309546000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758308"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":49,"alternative-id":["10.1145\/3746027.3758308","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758308","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
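For anyone who needs to reproduce or refresh the record above: it has the standard envelope returned by the public Crossref REST API ("status", "message-type", "message-version", with the work itself under "message"), and can be fetched by DOI from https://api.crossref.org/works/{doi}. The Python sketch below is a minimal illustration under that assumption; the field names it reads ("title", "author", "DOI", "page", "container-title", "references-count") are taken directly from the record above, while the printing logic is only an example.

import requests

# Minimal sketch: fetch the Crossref record shown above by its DOI.
# The /works/{doi} endpoint returns {"status": "ok", "message": {...}},
# where "message" carries exactly the fields seen in the record above.
DOI = "10.1145/3746027.3758308"
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
msg = resp.json()["message"]

# "title" and "container-title" are one-element lists; each "author"
# entry carries separate given/family name parts.
print(msg["title"][0])
for author in msg.get("author", []):
    name = f'{author.get("given", "")} {author.get("family", "")}'.strip()
    print(" -", name)
print(msg["container-title"][0], "pp.", msg["page"])
print("References:", msg["references-count"])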