{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T04:59:02Z","timestamp":1781585942333,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714739","type":"proceedings-article","created":{"date-parts":[[2025,5,5]],"date-time":"2025-05-05T16:42:02Z","timestamp":1746463322000},"page":"2872-2881","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Towards Multimodal Empathetic Response Generation: A Rich Text-Speech-Vision Avatar-based Benchmark"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1204-9614","authenticated-orcid":false,"given":"Han","family":"Zhang","sequence":"first","affiliation":[{"name":"School of Electronic Engineering, Xidian University, Xi'an, Shannxi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7379-7478","authenticated-orcid":false,"given":"Zixiang","family":"Meng","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2274-5719","authenticated-orcid":false,"given":"Meng","family":"Luo","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8019-3740","authenticated-orcid":false,"given":"Hong","family":"Han","sequence":"additional","affiliation":[{"name":"School of Electronic Engineering, Xidian University, Xi'an, Shaanxi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9973-3305","authenticated-orcid":false,"given":"Lizi","family":"Liao","sequence":"additional","affiliation":[{"name":"Singapore Management University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3030-1280","authenticated-orcid":false,"given":"Erik","family":"Cambria","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3026-6347","authenticated-orcid":false,"given":"Hao","family":"Fei","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635836"},{"key":"e_1_3_2_1_3_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03--30-vicuna\/"},{"key":"e_1_3_2_1_4_1","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al. 2024. Scaling instruction-finetuned language models. Journal of Machine Learning Research 25, 70 (2024), 1--53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_5_1","first-page":"13","article-title":"Out of time: automated lip sync in the wild. In Computer Vision--ACCV 2016 Workshops: ACCV 2016 International Workshops, Taipei, Taiwan, November 20--24, 2016, Revised Selected Papers","author":"Chung Joon Son","year":"2017","unstructured":"Joon Son Chung and Andrew Zisserman. 2017. Out of time: automated lip sync in the wild. In Computer Vision--ACCV 2016 Workshops: ACCV 2016 International Workshops, Taipei, Taiwan, November 20--24, 2016, Revised Selected Papers, Part II 13. 251--263.","journal-title":"Part"},{"key":"e_1_3_2_1_6_1","volume-title":"Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499","author":"Dong Runpei","year":"2023","unstructured":"Runpei Dong, Chunrui Han, Yuang Peng, Zekun Qi, Zheng Ge, Jinrong Yang, Liang Zhao, Jianjian Sun, Hongyu Zhou, Haoran Wei, et al. 2023. Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-short.101"},{"key":"e_1_3_2_1_8_1","volume-title":"Editing. Proceedings of the Advances in neural information processing systems.","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024. VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting, Editing. Proceedings of the Advances in neural information processing systems."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3393452"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3689171"},{"key":"e_1_3_2_1_11_1","volume-title":"EmpathyEar: An Open-source Avatar Multimodal Empathetic Chatbot. arXiv preprint arXiv:2406.15177","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Han Zhang, Bin Wang, Lizi Liao, Qian Liu, and Erik Cambria. 2024. EmpathyEar: An Open-source Avatar Multimodal Empathetic Chatbot. arXiv preprint arXiv:2406.15177 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6271"},{"key":"e_1_3_2_1_13_1","first-page":"807","article-title":"Improving empathetic response generation by recognizing emotion cause in conversations. In Findings of the association for computational linguistics","volume":"2021","author":"Gao Jun","year":"2021","unstructured":"Jun Gao, Yuhan Liu, Haolin Deng, Wei Wang, Yu Cao, Jiachen Du, and Ruifeng Xu. 2021. Improving empathetic response generation by recognizing emotion cause in conversations. In Findings of the association for computational linguistics: EMNLP 2021. 807--819.","journal-title":"EMNLP"},{"key":"e_1_3_2_1_14_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing 29 (2021), 3451--3460."},{"key":"e_1_3_2_1_15_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_16_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_17_1","volume-title":"Diaasq: A benchmark of conversational aspect-based sentiment quadruple analysis. arXiv preprint arXiv:2211.05705","author":"Li Bobo","year":"2022","unstructured":"Bobo Li, Hao Fei, Fei Li, Yuhan Wu, Jinsong Zhang, Shengqiong Wu, Jingye Li, Yijiang Liu, Lizi Liao, Tat-Seng Chua, et al. 2022. Diaasq: A benchmark of conversational aspect-based sentiment quadruple analysis. arXiv preprint arXiv:2211.05705 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"A diversity-promoting objective function for neural conversation models. arXiv preprint arXiv:1510.03055","author":"Li Jiwei","year":"2015","unstructured":"Jiwei Li, Michel Galley, Chris Brockett, Jianfeng Gao, and Bill Dolan. 2015. A diversity-promoting objective function for neural conversation models. arXiv preprint arXiv:1510.03055 (2015)."},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. 19730--19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. 19730--19742."},{"key":"e_1_3_2_1_20_1","volume-title":"A Survey on Benchmarks of Multimodal Large Language Models. arXiv preprint arXiv:2408.08632","author":"Li Jian","year":"2024","unstructured":"Jian Li and Weiheng Lu. 2024. A Survey on Benchmarks of Multimodal Large Language Models. arXiv preprint arXiv:2408.08632 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21347"},{"key":"e_1_3_2_1_22_1","volume-title":"Styletts 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models. Advances in Neural Information Processing Systems 36","author":"Li Yinghao Aaron","year":"2024","unstructured":"Yinghao Aaron Li, Cong Han, Vinay Raghavan, Gavin Mischler, and Nima Mesgarani. 2024. Styletts 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Moel: Mixture of empathetic listeners. arXiv preprint arXiv:1908.07687","author":"Lin Zhaojiang","year":"2019","unstructured":"Zhaojiang Lin, Andrea Madotto, Jamin Shin, Peng Xu, and Pascale Fung. 2019. Moel: Mixture of empathetic listeners. arXiv preprint arXiv:1908.07687 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems 36","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"The voice conversion challenge 2018: Promoting development of parallel and nonparallel methods. arXiv preprint arXiv:1804.04262","author":"Lorenzo-Trueba Jaime","year":"2018","unstructured":"Jaime Lorenzo-Trueba, Junichi Yamagishi, Tomoki Toda, Daisuke Saito, Fernando Villavicencio, Tomi Kinnunen, and Zhenhua Ling. 2018. The voice conversion challenge 2018: Promoting development of parallel and nonparallel methods. arXiv preprint arXiv:1804.04262 (2018)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"e_1_3_2_1_28_1","volume-title":"PanoSent: A Panoptic Sextuple Extraction Benchmark for Multimodal Conversational Aspect-based Sentiment Analysis. arXiv preprint arXiv:2408.09481","author":"Luo Meng","year":"2024","unstructured":"Meng Luo, Hao Fei, Bobo Li, Shengqiong Wu, Qian Liu, Soujanya Poria, Erik Cambria, Mong-Li Lee, and Wynne Hsu. 2024. PanoSent: A Panoptic Sextuple Extraction Benchmark for Multimodal Conversational Aspect-based Sentiment Analysis. arXiv preprint arXiv:2408.09481 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.semeval-1.226"},{"key":"e_1_3_2_1_30_1","volume-title":"Dreamtalk: When expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767","author":"Ma Yifeng","year":"2023","unstructured":"Yifeng Ma, Shiwei Zhang, Jiayu Wang, Xiang Wang, Yingya Zhang, and Zhidong Deng. 2023. Dreamtalk: When expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"MIME: MIMicking emotions for empathetic response generation. arXiv preprint arXiv:2010.01454","author":"Majumder Navonil","year":"2020","unstructured":"Navonil Majumder, Pengfei Hong, Shanshan Peng, Jiankun Lu, Deepanway Ghosal, Alexander Gelbukh, Rada Mihalcea, and Soujanya Poria. 2020. MIME: MIMicking emotions for empathetic response generation. arXiv preprint arXiv:2010.01454 (2020)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2011.2131660"},{"key":"e_1_3_2_1_33_1","volume-title":"Harnessing the power of large language models for empathetic response generation: Empirical investigations and improvements. arXiv preprint arXiv:2310.05140","author":"Qian Yushan","year":"2023","unstructured":"Yushan Qian, Wei-Nan Zhang, and Ting Liu. 2023. Harnessing the power of large language models for empathetic response generation: Empirical investigations and improvements. arXiv preprint arXiv:2310.05140 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Empathetic conversational systems: A review of current advances, gaps, and opportunities","author":"Raamkumar Aravind Sesagiri","year":"2022","unstructured":"Aravind Sesagiri Raamkumar and Yinping Yang. 2022. Empathetic conversational systems: A review of current advances, gaps, and opportunities. IEEE Transactions on Affective Computing (2022), 2722--2739."},{"key":"e_1_3_2_1_35_1","volume-title":"International conference on machine learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. 8748--8763."},{"key":"e_1_3_2_1_36_1","volume-title":"Towards empathetic open-domain conversation models: A new benchmark and dataset. arXiv preprint arXiv:1811.00207","author":"Rashkin Hannah","year":"2018","unstructured":"Hannah Rashkin. 2018. Towards empathetic open-domain conversation models: A new benchmark and dataset. arXiv preprint arXiv:1811.00207 (2018)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21373"},{"key":"e_1_3_2_1_39_1","volume-title":"Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv e-prints","author":"Team GLM","year":"2024","unstructured":"GLM Team, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Diego Rojas, Guanyu Feng, Hanlin Zhao, Hanyu Lai, et al. 2024. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv e-prints (2024), arXiv--2406."},{"key":"e_1_3_2_1_41_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_42_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_43_1","volume-title":"Measuring speech quality for text-to-speech systems: development and assessment of a modified mean opinion score (MOS) scale. Computer speech & language 19, 1","author":"Viswanathan Mahesh","year":"2005","unstructured":"Mahesh Viswanathan and Madhubalan Viswanathan. 2005. Measuring speech quality for text-to-speech systems: development and assessment of a modified mean opinion score (MOS) scale. Computer speech & language 19, 1 (2005), 55--83."},{"key":"e_1_3_2_1_44_1","volume-title":"A universal image quality index","author":"Wang Zhou","year":"2002","unstructured":"Zhou Wang and Alan C Bovik. 2002. A universal image quality index. IEEE signal processing letters 9, 3 (2002), 81--84."},{"key":"e_1_3_2_1_45_1","volume-title":"Towards Semantic Equivalence of Tokenization in Multimodal LLM. arXiv preprint arXiv:2406.05127","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Xiangtai Li, Jiayi Ji, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024. Towards Semantic Equivalence of Tokenization in Multimodal LLM. arXiv preprint arXiv:2406.05127 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the International Conference on Machine Learning. 53366--53397","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024. NExT-GPT: Any-to-Any Multimodal LLM. In Proceedings of the International Conference on Machine Learning. 53366--53397."},{"key":"e_1_3_2_1_47_1","volume-title":"Faithful Logical Reasoning via Symbolic Chain-of-Thought. arXiv preprint arXiv:2405.18357","author":"Xu Jundong","year":"2024","unstructured":"Jundong Xu, Hao Fei, Liangming Pan, Qian Liu, Mong-Li Lee, and Wynne Hsu. 2024. Faithful Logical Reasoning via Symbolic Chain-of-Thought. arXiv preprint arXiv:2405.18357 (2024)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.801"},{"key":"e_1_3_2_1_49_1","volume-title":"Exploiting emotion-semantic correlations for empathetic response generation. arXiv preprint arXiv:2402.17437","author":"Yang Zhou","year":"2024","unstructured":"Zhou Yang, Zhaochun Ren, Yufeng Wang, Xiaofei Zhu, Zhihao Chen, Tiecheng Cai, Yunbing Wu, Yisong Su, Sibo Ju, and Xiangwen Liao. 2024. Exploiting emotion-semantic correlations for empathetic response generation. arXiv preprint arXiv:2402.17437 (2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. arXiv preprint arXiv:2402.11801","author":"Yang Zhou","year":"2024","unstructured":"Zhou Yang, Zhaochun Ren, Wang Yufeng, Shizhong Peng, Haizhou Sun, Xiaofei Zhu, and Xiangwen Liao. 2024. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. arXiv preprint arXiv:2402.11801 (2024)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.417"},{"key":"e_1_3_2_1_52_1","unstructured":"Wayne Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong et al. 2023. A survey of large language models. arXiv preprint arXiv:2303.18223 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"ECQED: emotion-cause quadruple extraction in dialogs. arXiv preprint arXiv:2306.03969","author":"Zheng Li","year":"2023","unstructured":"Li Zheng, Donghong Ji, Fei Li, Hao Fei, Shengqiong Wu, Jingye Li, Bobo Li, and Chong Teng. 2023. ECQED: emotion-cause quadruple extraction in dialogs. arXiv preprint arXiv:2306.03969 (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"Case: Aligning coarse-to-fine cognition and affection for empathetic response generation. arXiv preprint arXiv:2208.08845","author":"Zhou Jinfeng","year":"2022","unstructured":"Jinfeng Zhou, Chujie Zheng, Bo Wang, Zheng Zhang, and Minlie Huang. 2022. Case: Aligning coarse-to-fine cognition and affection for empathetic response generation. arXiv preprint arXiv:2208.08845 (2022)."},{"key":"e_1_3_2_1_55_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714739","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714739","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:41Z","timestamp":1750295921000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714739"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":55,"alternative-id":["10.1145\/3696410.3714739","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714739","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}