{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T14:47:00Z","timestamp":1781016420164,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62222213, U22B2059, 62072423"],"award-info":[{"award-number":["62222213, U22B2059, 62072423"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658104","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"533-542","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":24,"title":["Speak From Heart: An Emotion-Guided LLM-Based Multimodal Method for Emotional Dialogue Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-4877-8212","authenticated-orcid":false,"given":"Chenxiao","family":"Liu","sequence":"first","affiliation":[{"name":"University Of Science And Technology Of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7453-5781","authenticated-orcid":false,"given":"Zheyong","family":"Xie","sequence":"additional","affiliation":[{"name":"University Of Science And Technology Of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8103-0321","authenticated-orcid":false,"given":"Sirui","family":"Zhao","sequence":"additional","affiliation":[{"name":"University Of Science And Technology Of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2298-0807","authenticated-orcid":false,"given":"Jin","family":"Zhou","sequence":"additional","affiliation":[{"name":"University Of Science And Technology Of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4246-5386","authenticated-orcid":false,"given":"Tong","family":"Xu","sequence":"additional","affiliation":[{"name":"University Of Science And Technology Of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1427-3507","authenticated-orcid":false,"given":"Minglei","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Cloud, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4835-4102","authenticated-orcid":false,"given":"Enhong","family":"Chen","sequence":"additional","affiliation":[{"name":"University Of Science And Technology Of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems , Vol. 35 (2022), 23716--23736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_3_1","volume-title":"TOMGPT: Reliable Text-Only Training Approach for Cost-Effective Multi-modal Large Language Model. ACM Transactions on Knowledge Discovery from Data","author":"Chen Yunkai","year":"2024","unstructured":"Yunkai Chen, Qimeng Wang, Shiwei Wu, Yan Gao, Tong Xu, and Yao Hu. 2024. TOMGPT: Reliable Text-Only Training Approach for Cost-Effective Multi-modal Large Language Model. ACM Transactions on Knowledge Discovery from Data (2024)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2020.3015491"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.224"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19--1015"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.547"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19--1037"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10919-019-00293-3"},{"key":"e_1_3_2_1_12_1","unstructured":"Shanglin Lei Guanting Dong Xiaoping Wang Keheng Wang and Sirui Wang. 2023. InstructERC: Reforming Emotion Recognition in Conversation with a Retrieval Multi-task LLMs Framework. arxiv: 2309.11911 [cs.CL]"},{"key":"e_1_3_2_1_13_1","volume-title":"SPAGE: A Speaker and Position-Aware Graph Neural Network Model for Emotion Recognition in Conversation. In Proceedings of the 2nd Conference of the Asia-Pacific","author":"Liang Chen","year":"2022","unstructured":"Chen Liang, Jing Xu, Yangkun Lin, Chong Yang, and Yongliang Wang. 2022. SPAGE: A Speaker and Position-Aware Graph Neural Network Model for Emotion Recognition in Conversation. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Yulan He, Heng Ji, Sujian Li, Yang Liu, and Chua-Hui Chang (Eds.). Association for Computational Linguistics, Online only, 148--157. https:\/\/aclanthology.org\/2022.aacl-main.12"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","unstructured":"Zujie Liang Huang Hu Can Xu Chongyang Tao Xiubo Geng Yining Chen Fan Liang and Daxin Jiang. 2021. Maria: A Visual Experience Powered Conversational Agent. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers) Chengqing Zong Fei Xia Wenjie Li and Roberto Navigli (Eds.). Association for Computational Linguistics Online 5596--5611. https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.435","DOI":"10.18653\/v1\/2021.acl-long.435"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.3115\/1218955.1219032"},{"key":"e_1_3_2_1_16_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong Jae Lee. 2023 a. Improved Baselines with Visual Instruction Tuning."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"e_1_3_2_1_18_1","unstructured":"Siyang Liu Chujie Zheng Orianna Demasi Sahand Sabour Yu Li Zhou Yu Yong Jiang and Minlie Huang. 2021b. Towards Emotional Support Dialog Systems. In ACL."},{"key":"e_1_3_2_1_19_1","volume-title":"P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks. CoRR","author":"Liu Xiao","year":"2021","unstructured":"Xiao Liu, Kaixuan Ji, Yicheng Fu, Zhengxiao Du, Zhilin Yang, and Jie Tang. 2021a. P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks. CoRR , Vol. abs\/2110.07602 (2021). showeprint[arXiv]2110.07602 https:\/\/arxiv.org\/abs\/2110.07602"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016810"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2020.06.011"},{"key":"e_1_3_2_1_22_1","volume-title":"Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv preprint arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv preprint arXiv:2306.05424 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.eacl-srw.7"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.721"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016818"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350923"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_28_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1534"},{"key":"e_1_3_2_1_30_1","volume-title":"Directed acyclic graph network for conversational emotion recognition. arXiv preprint arXiv:2105.12907","author":"Shen Weizhou","year":"2021","unstructured":"Weizhou Shen, Siyue Wu, Yunyi Yang, and Xiaojun Quan. 2021. Directed acyclic graph network for conversational emotion recognition. arXiv preprint arXiv:2105.12907 (2021)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1359"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.204"},{"key":"e_1_3_2_1_33_1","volume-title":"Enhancing Personalized Dialogue Generation with Contrastive Latent Variables: Combining Sparse and Dense Persona. arXiv preprint arXiv:2305.11482","author":"Tang Yihong","year":"2023","unstructured":"Yihong Tang, Bo Wang, Miao Fang, Dongming Zhao, Kun Huang, Ruifang He, and Yuexian Hou. 2023. Enhancing Personalized Dialogue Generation with Contrastive Latent Variables: Combining Sparse and Dense Persona. arXiv preprint arXiv:2305.11482 (2023)."},{"key":"e_1_3_2_1_34_1","unstructured":"He sicheng Wang Yuxin Sun Qingxuan. 2023. M3E: Moka Massive Mixed Embedding Model."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3612920"},{"key":"e_1_3_2_1_36_1","volume-title":"A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.391"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571819"},{"key":"e_1_3_2_1_39_1","volume-title":"Comae: A multi-factor hierarchical framework for empathetic response generation. arXiv preprint arXiv:2105.08316","author":"Zheng Chujie","year":"2021","unstructured":"Chujie Zheng, Yong Liu, Wei Chen, Yongcai Leng, and Minlie Huang. 2021. Comae: A multi-factor hierarchical framework for empathetic response generation. arXiv preprint arXiv:2105.08316 (2021)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11325"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1104"},{"key":"e_1_3_2_1_42_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658104","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658104","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:51:08Z","timestamp":1755766268000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658104"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":42,"alternative-id":["10.1145\/3652583.3658104","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658104","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}