{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:41:04Z","timestamp":1765309264179,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Nos.11801097"],"award-info":[{"award-number":["Nos.11801097"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"The Scientific Research Capacity Improvement Project of the Doctoral Program Construction Unit of Guangdong Polytechnic Normal University in 2022","award":["22GPNUZDJS31"],"award-info":[{"award-number":["22GPNUZDJS31"]}]},{"name":"Guangdong Provincial Characteristic Innovation Project for Colleges and Universities","award":["2024KTSCX172"],"award-info":[{"award-number":["2024KTSCX172"]}]},{"name":"Guangdong Provincial Postgraduate Education Innovation Program Project","award":["2024JGXM_098"],"award-info":[{"award-number":["2024JGXM_098"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762009","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"13848-13855","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Affective-CoT: Decomposing Multimodal Emotion Reasoning through a Hierarchical Cognitive Workflow"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4028-448X","authenticated-orcid":false,"given":"Yuesheng","family":"Huang","sequence":"first","affiliation":[{"name":"School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8427-9733","authenticated-orcid":false,"given":"Jinming","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9703-5961","authenticated-orcid":false,"given":"Jiajia","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7637-3572","authenticated-orcid":false,"given":"Yihang","family":"Lin","sequence":"additional","affiliation":[{"name":"School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6064-2564","authenticated-orcid":false,"given":"Yanmei","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9891-8263","authenticated-orcid":false,"given":"Jianwei","family":"Dong","sequence":"additional","affiliation":[{"name":"College of Medical Information Engineering, Guangdong Pharmaceutical University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Deepanway Ghosal, Soujanya Poria, Asif Ekbal, and Pushpak Bhattacharyya.","author":"Akhtar Md Shad","year":"2019","unstructured":"Md Shad Akhtar, Dushyant Singh Chauhan, Deepanway Ghosal, Soujanya Poria, Asif Ekbal, and Pushpak Bhattacharyya. 2019. Multi-task learning for multi-modal emotion recognition and sentiment analysis. arXiv preprint arXiv:1905.05812 (2019)."},{"key":"e_1_3_2_1_2_1","volume-title":"MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens. arXiv preprint arXiv:2404.03413","author":"Ataallah Kirolos","year":"2024","unstructured":"Kirolos Ataallah, Xiaoqian Shen, Eslam Abdelrahman, Essam Sleiman, Deyao Zhu, Jian Ding, and Mohamed Elhoseiny. 2024. MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens. arXiv preprint arXiv:2404.03413 (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_4_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Baltru\u0161aitis Tadas","year":"2018","unstructured":"Tadas Baltru\u0161aitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 2 (2018), 423-443."},{"key":"e_1_3_2_1_5_1","volume-title":"IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation","author":"Busso Carlos","year":"2008","unstructured":"Carlos Busso, Murtaza Bulut, Chi-Chun Lee, Abe Kazemzadeh, Emily Mower, Samuel Kim, Jeannette N Chang, Sungbok Lee, and Shrikanth S Narayanan. 2008. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation, Vol. 42, 4 (2008), 335-359."},{"key":"e_1_3_2_1_6_1","volume-title":"2015 IEEE International Conference on Bioinformatics and Biomedicine (BIBM). IEEE, 395-399","author":"Chen Jing","year":"2015","unstructured":"Jing Chen, Bin Hu, Lixin Xu, Philip Moore, and Yun Su. 2015. Feature-level fusion of multimodal physiological signals for emotion recognition. In 2015 IEEE International Conference on Bioinformatics and Biomedicine (BIBM). IEEE, 395-399."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185-24198."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3518"},{"key":"e_1_3_2_1_9_1","unstructured":"Zesen Cheng Sicong Leng Hang Zhang Yifei Xin Xin Li Guanzheng Chen Yongxin Zhu Wenqi Zhang Ziyang Luo Deli Zhao et al. 2024b. Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms. arXiv preprint arXiv:2406.07476 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin et al. 2024. Qwen2-audio technical report. arXiv preprint arXiv:2407.10759 (2024)."},{"key":"e_1_3_2_1_11_1","article-title":"Multi-modal emotion recognition from speech and text","volume":"9","author":"Chuang Ze-Jing","year":"2004","unstructured":"Ze-Jing Chuang and Chung-Hsien Wu. 2004. Multi-modal emotion recognition from speech and text. In International Journal of Computational Linguistics & Chinese Language Processing, Volume 9, Number 2, August 2004: Special Issue on New Trends of Speech and Language Processing. 45-62.","journal-title":"International Journal of Computational Linguistics & Chinese Language Processing"},{"key":"e_1_3_2_1_12_1","unstructured":"Yunkai Dang Kaichen Huang Jiahao Huo Yibo Yan Sirui Huang Dongrui Liu Mengxi Gao Jie Zhang Chen Qian Kun Wang et al. 2024. Explainable and interpretable multimodal large language models: A comprehensive survey. arXiv preprint arXiv:2412.02104 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"Towards a rigorous science of interpretable machine learning. arXiv preprint arXiv:1702.08608","author":"Doshi-Velez Finale","year":"2017","unstructured":"Finale Doshi-Velez and Been Kim. 2017. Towards a rigorous science of interpretable machine learning. arXiv preprint arXiv:1702.08608 (2017)."},{"key":"e_1_3_2_1_14_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1207"},{"key":"e_1_3_2_1_16_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCTIS63324.2024.10699096"},{"key":"e_1_3_2_1_18_1","volume-title":"Can Generated Images Serve as a Viable Modality for Text-Centric Multimodal Learning? arXiv preprint arXiv:2506.17623","author":"Huang Yuesheng","year":"2025","unstructured":"Yuesheng Huang, Peng Zhang, Riliang Liu, and Jiaqi Liang. 2025. Can Generated Images Serve as a Viable Modality for Text-Centric Multimodal Learning? arXiv preprint arXiv:2506.17623 (2025)."},{"key":"e_1_3_2_1_19_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"Attention is not explanation. arXiv preprint arXiv:1902.10186","author":"Jain Sarthak","year":"2019","unstructured":"Sarthak Jain and Byron C Wallace. 2019. Attention is not explanation. arXiv preprint arXiv:1902.10186 (2019)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_2_1_22_1","volume-title":"Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, et al., 2024. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Deep facial expression recognition: A survey","author":"Li Shan","year":"2020","unstructured":"Shan Li and Weihong Deng. 2020. Deep facial expression recognition: A survey. IEEE transactions on affective computing, Vol. 13, 3 (2020), 1195-1215."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Zheng Lian Rui Liu Kele Xu Bin Liu Xuefei Liu Yazhou Zhang Xin Liu Yong Li Zebang Cheng Haolin Zuo et al. 2025a. Mer 2025: When affective computing meets large language models. arXiv preprint arXiv:2504.19423 (2025).","DOI":"10.1145\/3746027.3762007"},{"key":"e_1_3_2_1_25_1","unstructured":"Zheng Lian Haiyang Sun Licai Sun Haoyu Chen Lan Chen Hao Gu Zhuofan Wen Shun Chen Siyuan Zhang Hailiang Yao et al. 2024a. OV-MER: Towards Open-Vocabulary Multimodal Emotion Recognition. arXiv preprint arXiv:2410.01495 (2024)."},{"key":"e_1_3_2_1_26_1","unstructured":"Zheng Lian Haiyang Sun Licai Sun Hao Gu Zhuofan Wen Siyuan Zhang Shun Chen Mingyu Xu Ke Xu Kang Chen et al. 2023. Explainable multimodal emotion recognition. arXiv preprint arXiv:2306.15401 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"AffectGPT: Dataset and framework for explainable multimodal emotion recognition. arXiv preprint arXiv:2407.07653","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Jiangyan Yi, Bin Liu, and Jianhua Tao. 2024c. AffectGPT: Dataset and framework for explainable multimodal emotion recognition. arXiv preprint arXiv:2407.07653 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"DMER-Ranker: Learning to Rank Emotion Descriptions in the Absence of Ground Truth. arXiv preprint arXiv:2507.04278","author":"Lian Zheng","year":"2025","unstructured":"Zheng Lian, Licai Sun, Haoyu Chen, Zebang Cheng, Fan Zhang, Ziyu Jia, Ziyang Ma, Fei Ma, Xiaojiang Peng, and Jianhua Tao. 2025b. DMER-Ranker: Learning to Rank Emotion Descriptions in the Absence of Ground Truth. arXiv preprint arXiv:2507.04278 (2025)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102367"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3672758.3672824"},{"key":"e_1_3_2_1_31_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_32_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024a. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li Bo Li Yuanhan Zhang Sheng Shen and Yong Jae Lee. 2024b. LLaVA-NeXT: Improved reasoning OCR and world knowledge. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5494890"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2020.3032373"},{"key":"e_1_3_2_1_36_1","volume-title":"A review of affective computing: From unimodal analysis to multimodal fusion. Information fusion","author":"Poria Soujanya","year":"2017","unstructured":"Soujanya Poria, Erik Cambria, Rajiv Bajpai, and Amir Hussain. 2017. A review of affective computing: From unimodal analysis to multimodal fusion. Information fusion, Vol. 37 (2017), 98-125."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-3020"},{"key":"e_1_3_2_1_38_1","volume-title":"Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Salmonn: Towards generic hearing abilities for large language models. arXiv preprint arXiv:2310.13289","author":"Tang Changli","year":"2023","unstructured":"Changli Tang, Wenyi Yu, Guangzhi Sun, Xianzhao Chen, Tian Tan, Wei Li, Lu Lu, Zejun Ma, and Chao Zhang. 2023. Salmonn: Towards generic hearing abilities for large language models. arXiv preprint arXiv:2310.13289 (2023)."},{"key":"e_1_3_2_1_40_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_42_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_44_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. 2023. React: Synergizing reasoning and acting in language models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_2_1_46_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762009","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:37:28Z","timestamp":1765309048000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762009"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":46,"alternative-id":["10.1145\/3746027.3762009","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762009","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}