{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T10:58:53Z","timestamp":1778497133736,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":27,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3689092.3689403","type":"proceedings-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T18:33:17Z","timestamp":1729708397000},"page":"104-109","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Multimodal Emotion Captioning Using Large Language Model with Prompt Engineering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7063-7317","authenticated-orcid":false,"given":"Yaoxun","family":"Xu","sequence":"first","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6363-891X","authenticated-orcid":false,"given":"Yixuan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4431-2886","authenticated-orcid":false,"given":"Yunrui","family":"Cai","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2050-263X","authenticated-orcid":false,"given":"Jingran","family":"Xie","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9113-6318","authenticated-orcid":false,"given":"Runchuan","family":"Ye","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8533-0524","authenticated-orcid":false,"given":"Zhiyong","family":"Wu","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.38094\/jastt20291"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475957.3484454"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551876.3554805"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2024.106111"},{"key":"e_1_3_2_1_5_1","unstructured":"Qichen Han Weiqiang Yuan Dong Liu Xiang Li and Zhen Yang. 2021. Automated audio captioning with weakly supervised pre-training and word selection methods.. In DCASE. 6--10."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3247822"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30570"},{"key":"e_1_3_2_1_8_1","volume-title":"MER 2024: Semi-Supervised Learning, Noise Robustness, and Open-Vocabulary Multimodal Emotion Recognition. arXiv preprint arXiv:2404","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Zhuofan Wen, Siyuan Zhang, Shun Chen, Hao Gu, Jinming Zhao, Ziyang Ma, Xie Chen, et al. 2024. MER 2024: Semi-Supervised Learning, Noise Robustness, and Open-Vocabulary Multimodal Emotion Recognition. arXiv preprint arXiv:2404.17113 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"AffectGPT: Dataset and Framework for Explainable Multimodal Emotion Recognition. arXiv preprint arXiv:2407.07653","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Jiangyan Yi, Bin Liu, and Jianhua Tao. 2024. AffectGPT: Dataset and Framework for Explainable Multimodal Emotion Recognition. arXiv preprint arXiv:2407.07653 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"MER 2023: Multi-label Learning, Modality Robustness, and Semi-Supervised Learning. arXiv preprint arXiv:2304","author":"Lian Zheng","year":"2023","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Jinming Zhao, Ye Liu, Bin Liu, Jiangyan Yi, Meng Wang, Erik Cambria, Guoying Zhao, et al. 2023. MER 2023: Multi-label Learning, Modality Robustness, and Semi-Supervised Learning. arXiv preprint arXiv:2304.08981 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Explainable multimodal emotion reasoning. arXiv preprint arXiv:2306.15401","author":"Lian Zheng","year":"2023","unstructured":"Zheng Lian, Licai Sun, Mingyu Xu, Haiyang Sun, Ke Xu, Zhuofan Wen, Shun Chen, Bin Liu, and Jianhua Tao. 2023. Explainable multimodal emotion reasoning. arXiv preprint arXiv:2306.15401 (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475957.3484457"},{"key":"e_1_3_2_1_13_1","volume-title":"Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Rethinking the role of demonstrations: What makes in-context learning work? arXiv preprint arXiv:2202.12837","author":"Min Sewon","year":"2022","unstructured":"Sewon Min, Xinxi Lyu, Ari Holtzman, Mikel Artetxe, Mike Lewis, Hannaneh Hajishirzi, and Luke Zettlemoyer. 2022. Rethinking the role of demonstrations: What makes in-context learning work? arXiv preprint arXiv:2202.12837 (2022)."},{"key":"e_1_3_2_1_15_1","volume-title":"A review of affective computing: From unimodal analysis to multimodal fusion. Information fusion","author":"Poria Soujanya","year":"2017","unstructured":"Soujanya Poria, Erik Cambria, Rajiv Bajpai, and Amir Hussain. 2017. A review of affective computing: From unimodal analysis to multimodal fusion. Information fusion, Vol. 37 (2017), 98--125."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2018.2882362"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.04.028"},{"key":"e_1_3_2_1_18_1","volume-title":"Multimodal emotion recognition in response to videos","author":"Soleymani Mohammad","year":"2011","unstructured":"Mohammad Soleymani, Maja Pantic, and Thierry Pun. 2011. Multimodal emotion recognition in response to videos. IEEE transactions on affective computing, Vol. 3, 2 (2011), 211--223."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475957.3484456"},{"key":"e_1_3_2_1_20_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2764438"},{"key":"e_1_3_2_1_22_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824--24837."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29902"},{"key":"e_1_3_2_1_24_1","volume-title":"Wenet: Production oriented streaming and non-streaming end-to-end speech recognition toolkit. arXiv preprint arXiv:2102.01547","author":"Yao Zhuoyuan","year":"2021","unstructured":"Zhuoyuan Yao, Di Wu, Xiong Wang, Binbin Zhang, Fan Yu, Chao Yang, Zhendong Peng, Xiaoyu Chen, Lei Xie, and Xin Lei. 2021. Wenet: Production oriented streaming and non-streaming end-to-end speech recognition toolkit. arXiv preprint arXiv:2102.01547 (2021)."},{"key":"e_1_3_2_1_25_1","volume-title":"Improving the performance of automated audio captioning via integrating the acoustic and semantic information. arXiv preprint arXiv:2110.06100","author":"Ye Zhongjie","year":"2021","unstructured":"Zhongjie Ye, Helin Wang, Dongchao Yang, and Yuexian Zou. 2021. Improving the performance of automated audio captioning via integrating the acoustic and semantic information. arXiv preprint arXiv:2110.06100 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2020.01.011"},{"key":"e_1_3_2_1_27_1","volume-title":"Deep learning-based multimodal emotion recognition from audio, visual, and text modalities: A systematic review of recent advancements and future prospects. Expert Systems with Applications","author":"Zhang Shiqing","year":"2023","unstructured":"Shiqing Zhang, Yijiao Yang, Chen Chen, Xingnan Zhang, Qingming Leng, and Xiaoming Zhao. 2023. Deep learning-based multimodal emotion recognition from audio, visual, and text modalities: A systematic review of recent advancements and future prospects. Expert Systems with Applications (2023), 121692."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689403","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689092.3689403","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T01:59:40Z","timestamp":1755914380000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689403"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":27,"alternative-id":["10.1145\/3689092.3689403","10.1145\/3689092"],"URL":"https:\/\/doi.org\/10.1145\/3689092.3689403","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}