{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,27]],"date-time":"2026-06-27T16:22:29Z","timestamp":1782577349497,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076195, 62376070"],"award-info":[{"award-number":["62076195, 62376070"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762011","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"13864-13871","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Agent-MER: A Cognitive Agent with Hierarchical Deliberation for Open-Vocabulary Multimodal Emotion Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6144-5396","authenticated-orcid":false,"given":"Zhengqin","family":"Lai","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China and Pengcheng Laboratory, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8657-6408","authenticated-orcid":false,"given":"Zhilin","family":"Zhu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China and Pengcheng Laboratory, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0611-0636","authenticated-orcid":false,"given":"Xiaopeng","family":"Hong","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China and Pengcheng Laboratory, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6110-4036","authenticated-orcid":false,"given":"Yaowei","family":"Wang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China and Pengcheng Laboratory, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation","author":"Busso Carlos","year":"2008","unstructured":"Carlos Busso, Murtaza Bulut, Chi-Chun Lee, Abe Kazemzadeh, Emily Mower, Samuel Kim, Jeannette N Chang, Sungbok Lee, and Shrikanth S Narayanan. 2008. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation, Vol. 42 (2008), 335-359."},{"key":"e_1_3_2_1_2_1","volume-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919","author":"Chu Yunfei","year":"2023","unstructured":"Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, and Jingren Zhou. 2023. Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Bidipta Sarkar, Rohan Taori, Yusuke Noda, Demetri Terzopoulos, Yejin Choi, et al.","author":"Durante Zane","year":"2024","unstructured":"Zane Durante, Qiuyuan Huang, Naoki Wake, Ran Gong, Jae Sung Park, Bidipta Sarkar, Rohan Taori, Yusuke Noda, Demetri Terzopoulos, Yejin Choi, et al., 2024. AGENT AI: Surveying the Horizons of Multimodal Interaction. arXiv preprint arXiv:2401.03568v2 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"An argument for basic emotions. Cognition & emotion","author":"Ekman Paul","year":"1992","unstructured":"Paul Ekman. 1992. An argument for basic emotions. Cognition & emotion, Vol. 6, 3-4 (1992), 169-200."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413620"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689092.3689399"},{"key":"e_1_3_2_1_8_1","volume-title":"A Benchmark for Incremental Micro-expression Recognition. arXiv preprint arXiv:2501.19111","author":"Lai Zhengqin","year":"2025","unstructured":"Zhengqin Lai, Xiaopeng Hong, Yabin Wang, and Xiaobai Li. 2025. A Benchmark for Incremental Micro-expression Recognition. arXiv preprint arXiv:2501.19111 (2025)."},{"key":"e_1_3_2_1_9_1","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in neural information processing systems Vol. 33 (2020) 9459-9474."},{"key":"e_1_3_2_1_10_1","volume-title":"Language-driven Semantic Segmentation. In International Conference on Learning Representations.","author":"Li Boyi","year":"2021","unstructured":"Boyi Li, Kilian Q Weinberger, Serge Belongie, Vladlen Koltun, and Rene Ranftl. 2021. Language-driven Semantic Segmentation. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_11_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"European Conference on Computer Vision. Springer, 323-340","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024. Llama-vid: An image is worth 2 tokens in large language models. In European Conference on Computer Vision. Springer, 323-340."},{"key":"e_1_3_2_1_13_1","volume-title":"Appeared in Proceedings of the 41st International Conference on Machine Learning (PMLR 267)","author":"Lian Zheng","year":"2025","unstructured":"Zheng Lian, Haoyu Chen, Lan Chen, Haiyang Sun, Licai Sun, Yong Ren, Zebang Cheng, Bin Liu, Rui Liu, Xiaojiang Peng, Jiangyan Yi, and Jianhua Tao. 2025a. AffectGPT: A New Dataset, Model, and Benchmark for Emotion Understanding with Multimodal Large Language Models. arXiv preprint arXiv:2501.16566v2 (2025). Appeared in Proceedings of the 41st International Conference on Machine Learning (PMLR 267)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Zheng Lian Rui Liu Kele Xu Bin Liu Xuefei Liu Yazhou Zhang Xin Liu Yong Li Zebang Cheng Haolin Zuo et al. 2025b. Mer 2025: When affective computing meets large language models. arXiv preprint arXiv:2504.19423 (2025).","DOI":"10.1145\/3746027.3762007"},{"key":"e_1_3_2_1_15_1","volume-title":"Appeared in Proceedings of the 41st International Conference on Machine Learning (PMLR 267)","author":"Lian Zheng","year":"2025","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Haoyu Chen, Lan Chen, Hao Gu, Zhuofan Wen, Shun Chen, Siyuan Zhang, Hailiang Yao, Bin Liu, Rui Liu, Shan Liang, Ya Li, Jiangyan Yi, and Jianhua Tao. 2025c. OV-MER: Towards Open-Vocabulary Multimodal Emotion Recognition. arXiv preprint arXiv:2410.01495v3 (2025). Appeared in Proceedings of the 41st International Conference on Machine Learning (PMLR 267)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612836"},{"key":"e_1_3_2_1_17_1","first-page":"394","article-title":"Context-Dependent Domain Adversarial Neural Network for Multimodal Emotion Recognition","author":"Lian Zheng","year":"2020","unstructured":"Zheng Lian, Jianhua Tao, Bin Liu, Jian Huang, Zhanlei Yang, and Rongjun Li. 2020. Context-Dependent Domain Adversarial Neural Network for Multimodal Emotion Recognition.. In Interspeech. 394-398.","journal-title":"Interspeech."},{"key":"e_1_3_2_1_18_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i18.34127"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2025.3553290"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01308"},{"key":"e_1_3_2_1_22_1","volume-title":"Society of mind","author":"Minsky Marvin","unstructured":"Marvin Minsky. 1988. Society of mind. Simon and Schuster."},{"key":"e_1_3_2_1_23_1","volume-title":"Kanishka Rao, Dorsa Sadigh, and Andy Zeng.","author":"Mirchandani Suvir","year":"2023","unstructured":"Suvir Mirchandani, Fei Xia, Pete Florence, Brian Ichter, Danny Driess, Montserrat Gonzalez Arenas, Kanishka Rao, Dorsa Sadigh, and Andy Zeng. 2023. Large language models as general pattern machines. arXiv preprint arXiv:2307.04721 (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606763"},{"key":"e_1_3_2_1_25_1","volume-title":"A general psychoevolutionary theory of emotion. Emotion: Theory, research, and experience","author":"Plutchik Robert","year":"1980","unstructured":"Robert Plutchik. 1980. A general psychoevolutionary theory of emotion. Emotion: Theory, research, and experience, Vol. 1 (1980), 3-33."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1511\/2001.28.344"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689092.3689401"},{"key":"e_1_3_2_1_28_1","first-page":"68539","article-title":"Toolformer: Language models can teach themselves to use tools","volume":"36","author":"Schick Timo","year":"2023","unstructured":"Timo Schick, Jane Dwivedi-Yu, Roberto Dess`i, Roberta Raileanu, Maria Lomeli, Eric Hambro, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. 2023. Toolformer: Language models can teach themselves to use tools. Advances in Neural Information Processing Systems, Vol. 36 (2023), 68539-68551.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2023.3274829"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612365"},{"key":"e_1_3_2_1_31_1","volume-title":"Salmonn: Towards generic hearing abilities for large language models. arXiv preprint arXiv:2310.13289","author":"Tang Changli","year":"2023","unstructured":"Changli Tang, Wenyi Yu, Guangzhi Sun, Xianzhao Chen, Tian Tan, Wei Li, Lu Lu, Zejun Ma, and Chao Zhang. 2023. Salmonn: Towards generic hearing abilities for large language models. arXiv preprint arXiv:2310.13289 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-024-40231-1"},{"key":"e_1_3_2_1_33_1","volume-title":"Aakanksha Chowdhery, and Denny Zhou.","author":"Wang Xuezhi","year":"2022","unstructured":"Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, Sharan Narang, Aakanksha Chowdhery, and Denny Zhou. 2022. Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:2203.11171 (2022)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1242"},{"key":"e_1_3_2_1_35_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_36_1","unstructured":"Jianzong Wu Xiangtai Li Shilin Xu Haobo Yuan Henghui Ding Yibo Yang Xia Li Jiangning Zhang Yunhai Tong Xudong Jiang et al. 2024. Towards open vocabulary learning: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_37_1","unstructured":"Zhilin Xi Wenxiang Chen Xin Guo Wei He Yi Ding Bowen Hong Ming Zhang Jun Wang Siyu Jin Enyu Zhou et al. 2023. The rise and potential of large language model based agents: A survey. arXiv preprint arXiv:2309.07864 (2023)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21413"},{"key":"e_1_3_2_1_39_1","volume-title":"Tree of thoughts: Deliberate problem solving with large language models. Advances in neural information processing systems","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. 2023a. Tree of thoughts: Deliberate problem solving with large language models. Advances in neural information processing systems, Vol. 36 (2023), 11809-11822."},{"key":"e_1_3_2_1_40_1","volume-title":"Appeared in The Eleventh International Conference on Learning Representations (ICLR","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. 2023b. ReAct: Synergizing reasoning and acting in language models. arXiv preprint arXiv:2210.03629 (2023). Appeared in The Eleventh International Conference on Learning Representations (ICLR 2023)."},{"key":"e_1_3_2_1_41_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108820"},{"key":"e_1_3_2_1_44_1","volume-title":"European Conference on Computer Vision. Springer, 415-433","author":"Zhu Zhilin","year":"2024","unstructured":"Zhilin Zhu, Xiaopeng Hong, Zhiheng Ma, Weijun Zhuang, Yaohui Ma, Yong Dai, and Yaowei Wang. 2024. Reshaping the online data buffering and organizing mechanism for continual test-time adaptation. In European Conference on Computer Vision. Springer, 415-433."},{"key":"e_1_3_2_1_45_1","volume-title":"Conference on Robot Learning. PMLR, 2165-2183","author":"Zitkovich Brianna","year":"2023","unstructured":"Brianna Zitkovich, Tianhe Yu, Sichun Xu, Peng Xu, Ted Xiao, Fei Xia, Jialin Wu, Paul Wohlhart, Stefan Welker, Ayzaan Wahid, et al., 2023. Rt-2: Vision-language-action models transfer web knowledge to robotic control. In Conference on Robot Learning. PMLR, 2165-2183."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762011","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T03:59:54Z","timestamp":1765339194000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762011"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3762011","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762011","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}