{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:52Z","timestamp":1781539012508,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"name":"the National Natural Science Foundation of China grant","award":["62076227"],"award-info":[{"award-number":["62076227"]}]},{"name":"Natural Science Foundation of Hubei Province grant","award":["2023AFB572"],"award-info":[{"award-number":["2023AFB572"]}]},{"name":"Hubei Key Laboratory of Intelligent Geo-Information Processing","award":["KLIGIP-2022-B10"],"award-info":[{"award-number":["KLIGIP-2022-B10"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810634","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1128-1137","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CERA: Conflict-Explicit Reflective Agent for Multimodal Emotion Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2332-6157","authenticated-orcid":false,"given":"Kejun","family":"Liu","sequence":"first","affiliation":[{"name":"China University of Geosciences (Wuhan), Wuhan, Hubei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7326-9655","authenticated-orcid":false,"given":"Yuanyuan","family":"Liu","sequence":"additional","affiliation":[{"name":"China University of Geosciences (Wuhan), Wuhan, Hubei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8430-4826","authenticated-orcid":false,"given":"Ke","family":"Wang","sequence":"additional","affiliation":[{"name":"China University of Geosciences (Wuhan), Wuhan, Hubei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8410-1698","authenticated-orcid":false,"given":"Jiahao","family":"Zhang","sequence":"additional","affiliation":[{"name":"China University of Geosciences (Wuhan), Wuhan, Hubei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0947-9014","authenticated-orcid":false,"given":"Lei","family":"Xu","sequence":"additional","affiliation":[{"name":"China University of Geosciences (Wuhan), Wuhan, Hubei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6515-7696","authenticated-orcid":false,"given":"Chang","family":"Tang","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, Hubei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5004-8975","authenticated-orcid":false,"given":"Zhe","family":"Chen","sequence":"additional","affiliation":[{"name":"La Trobe University, Melbourne, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3180-0484","authenticated-orcid":false,"given":"Yibing","family":"Zhan","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, Hubei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_3_3_3_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang Humen Zhong Yuanzhi Zhu Mingkun Yang Zhaohai Li Jianqiang Wan Pengfei Wang Wei Ding Zheren Fu Yiheng Xu Jiabo Ye Xi Zhang Tianbao Xie Zesen Cheng Hang Zhang Zhibo Yang Haiyang Xu and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2502.13923\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2502.13923"},{"key":"e_1_3_3_3_4_2","unstructured":"Xiaokang Chen Zhiyu Wu Xingchao Liu Zizheng Pan Wen Liu Zhenda Xie Xingkai Yu and Chong Ruan. 2025. Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling. arxiv:https:\/\/arXiv.org\/abs\/2501.17811\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2501.17811"},{"key":"e_1_3_3_3_5_2","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu Lixin Gu Xuehui Wang Qingyun Li Yimin Ren Zixuan Chen Jiapeng Luo Jiahao Wang Tan Jiang Bo Wang Conghui He Botian Shi Xingcheng Zhang Han Lv Yi Wang Wenqi Shao Pei Chu Zhongying Tu Tong He Zhiyong Wu Huipeng Deng Jiaye Ge Kai Chen Kaipeng Zhang Limin Wang Min Dou Lewei Lu Xizhou Zhu Tong Lu Dahua Lin Yu Qiao Jifeng Dai and Wenhai Wang. 2025. Expanding Performance Boundaries of Open-Source Multimodal Models with Model Data and Test-Time Scaling. arxiv:https:\/\/arXiv.org\/abs\/2412.05271\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2412.05271"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3518"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"crossref","unstructured":"Zebang Cheng Zhi-Qi Cheng Jun-Yan He Kai Wang Yuxiang Lin Zheng Lian Xiaojiang Peng and Alexander Hauptmann. 2024. Emotion-llama: Multimodal emotion recognition and reasoning with instruction tuning. Advances in Neural Information Processing Systems 37 (2024) 110805\u2013110853.","DOI":"10.52202\/079017-3518"},{"key":"e_1_3_3_3_8_2","unstructured":"Yumeng Fu Junjie Wu Zhongjie Wang Meishan Zhang Lili Shan Yulin Wu and Bingquan Li. 2025. LaERC-S: Improving LLM-based Emotion Recognition in Conversation with Speaker Characteristics. arxiv:https:\/\/arXiv.org\/abs\/2403.07260\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2403.07260"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"crossref","unstructured":"Markus Hafner Maria Katsantoni Tino K\u00f6ster James Marks Joyita Mukherjee Dorothee Staiger Jernej Ule and Mihaela Zavolan. 2021. CLIP and complementary methods. Nature Reviews Methods Primers 1 1 (2021) 20.","DOI":"10.1038\/s43586-021-00018-1"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3754856"},{"key":"e_1_3_3_3_11_2","unstructured":"Devamanyu Hazarika Roger Zimmermann and Soujanya Poria. 2020. MISA: Modality-Invariant and -Specific Representations for Multimodal Sentiment Analysis. arxiv:https:\/\/arXiv.org\/abs\/2005.03545\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2005.03545"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_3_3_13_2","unstructured":"Junnan Li Ramprasaath\u00a0R. Selvaraju Akhilesh\u00a0Deepak Gotmare Shafiq Joty Caiming Xiong and Steven Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. arxiv:https:\/\/arXiv.org\/abs\/2107.07651\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2107.07651"},{"key":"e_1_3_3_3_14_2","unstructured":"Yafu Li Xuyang Hu Xiaoye Qu Linjie Li and Yu Cheng. 2025. Test-Time Preference Optimization: On-the-Fly Alignment via Iterative Textual Feedback. arXiv e-prints (2025) arXiv\u20132501."},{"key":"e_1_3_3_3_15_2","unstructured":"Zheng Lian Haoyu Chen Lan Chen Haiyang Sun Licai Sun Yong Ren Zebang Cheng Bin Liu Rui Liu Xiaojiang Peng et\u00a0al. 2025. Affectgpt: A new dataset model and benchmark for emotion understanding with multimodal large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.16566 (2025)."},{"key":"e_1_3_3_3_16_2","unstructured":"Zheng Lian Haiyang Sun Licai Sun Jiangyan Yi Bin Liu and Jianhua Tao. 2024. AffectGPT: Dataset and framework for explainable multimodal emotion recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.07653 (2024)."},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548190"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681583"},{"key":"e_1_3_3_3_19_2","unstructured":"Zhun Liu Ying Shen Varun\u00a0Bharadhwaj Lakshminarasimhan Paul\u00a0Pu Liang Amir Zadeh and Louis-Philippe Morency. 2018. Efficient Low-rank Multimodal Fusion with Modality-Specific Factors. arxiv:https:\/\/arXiv.org\/abs\/1806.00064\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/1806.00064"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","unstructured":"N. Majumder D. Hazarika A. Gelbukh E. Cambria and S. Poria. 2018. Multimodal\u00a0sentiment\u00a0analysis using hierarchical\u00a0fusion with context\u00a0modeling. Knowledge-Based Systems 161 (2018) 124\u2013133. 10.1016\/j.knosys.2018.07.041","DOI":"10.1016\/j.knosys.2018.07.041"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"crossref","unstructured":"Yu Meng Mengzhou Xia and Danqi Chen. 2024. Simpo: Simple preference optimization with a reference-free reward. Advances in Neural Information Processing Systems 37 (2024) 124198\u2013124235.","DOI":"10.52202\/079017-3946"},{"key":"e_1_3_3_3_22_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv:https:\/\/arXiv.org\/abs\/2103.00020\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","unstructured":"Wasifur Rahman Md\u00a0Kamrul Hasan Sangwu Lee Amir Zadeh Chengfeng Mao Louis-Philippe Morency and Ehsan Hoque. 2020. Integrating Multimodal Information in Large Pretrained Transformers. Proceedings of the conference. Association for Computational Linguistics. Meeting 2020 (July 2020) 2359\u20142369. 10.18653\/v1\/2020.acl-main.214","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","unstructured":"Licai Sun Zheng Lian Bin Liu and Jianhua Tao. 2024. HiCMAE: Hierarchical Contrastive Masked Autoencoder for self-supervised Audio-Visual Emotion Recognition. Information Fusion 108 (2024) 102382. 10.1016\/j.inffus.2024.102382","DOI":"10.1016\/j.inffus.2024.102382"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","unstructured":"Yao-Hung\u00a0Hubert Tsai Shaojie Bai Paul Pu\u00a0Liang J\u00a0Zico Kolter Louis-Philippe Morency and Ruslan Salakhutdinov. 2019. Multimodal Transformer for Unaligned Multimodal Language Sequences. Proceedings of the conference. Association for Computational Linguistics. Meeting 2019 (July 2019) 6558\u20146569. 10.18653\/v1\/p19-1656","DOI":"10.18653\/v1\/p19-1656"},{"key":"e_1_3_3_3_26_2","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Yang Fan Kai Dang Mengfei Du Xuancheng Ren Rui Men Dayiheng Liu Chang Zhou Jingren Zhou and Junyang Lin. 2024. Qwen2-VL: Enhancing Vision-Language Model\u2019s Perception of the World at Any Resolution. arxiv:https:\/\/arXiv.org\/abs\/2409.12191\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2409.12191"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681403"},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888606"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","unstructured":"Sheng Wu Dongxiao He Xiaobao Wang Longbiao Wang and Jianwu Dang. 2025. Enriching Multimodal Sentiment Analysis Through Textual Emotional Descriptions of Visual-Audio Content. Proceedings of the AAAI Conference on Artificial Intelligence 39 2 (Apr. 2025) 1601\u20131609. 10.1609\/aaai.v39i2.32152","DOI":"10.1609\/aaai.v39i2.32152"},{"key":"e_1_3_3_3_30_2","unstructured":"Jin Xu Zhifang Guo Jinzheng He Hangrui Hu Ting He Shuai Bai Keqin Chen Jialin Wang Yang Fan Kai Dang Bin Zhang Xiong Wang Yunfei Chu and Junyang Lin. 2025. Qwen2.5-Omni Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2503.20215\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2503.20215"},{"key":"e_1_3_3_3_31_2","unstructured":"Qize Yang Detao Bai Yi-Xing Peng and Xihan Wei. 2025. Omni-emotion: Extending video mllm with detailed face and audio modeling for multimodal emotion analysis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.09502 (2025)."},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054229"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","unstructured":"Wenmeng Yu Hua Xu Ziqi Yuan and Jiele Wu. 2021. Learning Modality-Specific Representations with Self-Supervised Multi-Task Learning for Multimodal Sentiment Analysis. Proceedings of the AAAI Conference on Artificial Intelligence 35 12 (May 2021) 10790\u201310797. 10.1609\/aaai.v35i12.17289","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"e_1_3_3_3_36_2","unstructured":"Amir Zadeh Minghai Chen Soujanya Poria Erik Cambria and Louis-Philippe Morency. 2017. Tensor Fusion Network for Multimodal Sentiment Analysis. arxiv:https:\/\/arXiv.org\/abs\/1707.07250\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1707.07250"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.49"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","unstructured":"Xiaoqin Zhang Min Li Sheng Lin Hang Xu and Guobao Xiao. 2024. Transformer-Based Multimodal Emotional Perception for Dynamic Facial Expression Recognition in the Wild. IEEE Transactions on Circuits and Systems for Video Technology 34 5 (2024) 3192\u20133203. 10.1109\/TCSVT.2023.3312858","DOI":"10.1109\/TCSVT.2023.3312858"},{"key":"e_1_3_3_3_39_2","unstructured":"Zhuosheng Zhang Aston Zhang Mu Li Hai Zhao George Karypis and Alex Smola. 2023. Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.00923 (2023)."},{"key":"e_1_3_3_3_40_2","unstructured":"Jiaxing Zhao Xihan Wei and Liefeng Bo. 2025. R1-omni: Explainable omni-multimodal emotion recognition with reinforcement learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.05379 (2025)."},{"key":"e_1_3_3_3_41_2","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Hao Tian Yuchen Duan Weijie Su Jie Shao Zhangwei Gao Erfei Cui Xuehui Wang Yue Cao Yangzhou Liu Xingguang Wei Hongjie Zhang Haomin Wang Weiye Xu Hao Li Jiahao Wang Nianchen Deng Songze Li Yinan He Tan Jiang Jiapeng Luo Yi Wang Conghui He Botian Shi Xingcheng Zhang Wenqi Shao Junjun He Yingtong Xiong Wenwen Qu Peng Sun Penglong Jiao Han Lv Lijun Wu Kaipeng Zhang Huipeng Deng Jiaye Ge Kai Chen Limin Wang Min Dou Lewei Lu Xizhou Zhu Tong Lu Dahua Lin Yu Qiao Jifeng Dai and Wenhai Wang. 2025. InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arxiv:https:\/\/arXiv.org\/abs\/2504.10479\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2504.10479"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:34:21Z","timestamp":1781537661000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810634"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":40,"alternative-id":["10.1145\/3805622.3810634","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810634","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}