{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,23]],"date-time":"2025-10-23T01:09:14Z","timestamp":1761181754258,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746270.3760215","type":"proceedings-article","created":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T15:14:09Z","timestamp":1760973249000},"page":"2-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["More Is Better: A MoE-Based Emotion Recognition Framework with Human Preference Alignment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6152-3943","authenticated-orcid":false,"given":"Jun","family":"Xie","sequence":"first","affiliation":[{"name":"Lenovo Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5538-0645","authenticated-orcid":false,"given":"Yingjian","family":"Zhu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8278-486X","authenticated-orcid":false,"given":"Feng","family":"Chen","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1530-7486","authenticated-orcid":false,"given":"Zhenghao","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1498-1989","authenticated-orcid":false,"given":"Xiaohui","family":"Fan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3333-5863","authenticated-orcid":false,"given":"Hongzhu","family":"Yi","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9111-4817","authenticated-orcid":false,"given":"Xinming","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3711-2078","authenticated-orcid":false,"given":"Chen","family":"Yu","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6187-5641","authenticated-orcid":false,"given":"Yue","family":"Bi","sequence":"additional","affiliation":[{"name":"Shandong University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3406-2598","authenticated-orcid":false,"given":"Zhaoran","family":"Zhao","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8887-3735","authenticated-orcid":false,"given":"Xiongjun","family":"Guan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6088-3517","authenticated-orcid":false,"given":"Zhepeng","family":"Wang","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,26]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1","article-title":"Openface: an open source facial behavior analysis toolkit. In 2016 IEEE winter conference on applications of computer vision (WACV)","author":"Baltru\u0161aitis Tadas","year":"2016","unstructured":"Tadas Baltru\u0161aitis, Peter Robinson, and Louis-Philippe Morency. 2016. Openface: an open source facial behavior analysis toolkit. In 2016 IEEE winter conference on applications of computer vision (WACV). IEEE, 1-10.","journal-title":"IEEE"},{"key":"e_1_3_2_1_2_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_1_3_1","first-page":"110805","article-title":"Emotion-llama: Multimodal emotion recognition and reasoning with instruction tuning","volume":"37","author":"Cheng Zebang","year":"2024","unstructured":"Zebang Cheng, Zhi-Qi Cheng, Jun-Yan He, Kai Wang, Yuxiang Lin, Zheng Lian, Xiaojiang Peng, and Alexander Hauptmann. 2024. Emotion-llama: Multimodal emotion recognition and reasoning with instruction tuning. Advances in Neural Information Processing Systems, Vol. 37 (2024), 110805-110853.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_4_1","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin et al. 2024. Qwen2-audio technical report. arXiv preprint arXiv:2407.10759 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3124365"},{"key":"e_1_3_2_1_6_1","volume-title":"Facial action coding system. Environmental Psychology & Nonverbal Behavior","author":"Ekman Paul","year":"1978","unstructured":"Paul Ekman and Wallace V Friesen. 1978. Facial action coding system. Environmental Psychology & Nonverbal Behavior (1978)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.3389\/frai.2024.1467051"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689092.3689412"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689092.3689415"},{"key":"e_1_3_2_1_10_1","unstructured":"Pengcheng Guo and Shixing Liu. 2022. chinese_speech_pretrain. https:\/\/github.com\/TencentGameMate\/chinese_speech_pretrain"},{"key":"e_1_3_2_1_11_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451-3460."},{"key":"e_1_3_2_1_12_1","volume-title":"Audio and Textual Data. In 2025 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). IEEE, 5305-5315","author":"Kumar Puneet","year":"2025","unstructured":"Puneet Kumar, Shreshtha Misra, Zhuhong Shao, Bin Zhu, Balasubramanian Raman, and Xiaobai Li. 2025. Multimodal Interpretable Depression Analysis Using Visual, Physiological, Audio and Textual Data. In 2025 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). IEEE, 5305-5315."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSEN.2024.3363042"},{"key":"e_1_3_2_1_14_1","unstructured":"Zheng Lian Rui Liu Kele Xu Bin Liu Xuefei Liu Yazhou Zhang Xin Liu Yong Li Zebang Cheng Haolin Zuo et al. 2025. Mer 2025: When affective computing meets large language models. arXiv preprint arXiv:2504.19423 (2025)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612836"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689092.3689959"},{"key":"e_1_3_2_1_17_1","volume-title":"Merbench: A unified evaluation benchmark for multimodal emotion recognition. arXiv preprint arXiv:2401.03429","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Licai Sun, Yong Ren, Hao Gu, Haiyang Sun, Lan Chen, Bin Liu, and Jianhua Tao. 2024a. Merbench: A unified evaluation benchmark for multimodal emotion recognition. arXiv preprint arXiv:2401.03429 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_19_1","volume-title":"Advancing education through tutoring systems: A systematic literature review. arXiv preprint arXiv:2503.09748","author":"Liu Vincent","year":"2025","unstructured":"Vincent Liu, Ehsan Latif, and Xiaoming Zhai. 2025. Advancing education through tutoring systems: A systematic literature review. arXiv preprint arXiv:2503.09748 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2013.130"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-022-05026-w"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689092.3689401"},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning (ICML). PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning (ICML). PmLR, 8748-8763."},{"key":"e_1_3_2_1_24_1","volume-title":"Radeya Chowdhury","author":"Sadi Abu Adnan","year":"2022","unstructured":"Abu Adnan Sadi, Labib Chowdhury, Nusrat Jahan, Mohammad Newaz Sharif Rafi, Radeya Chowdhury, Faisal Ahamed Khan, and Nabeel Mohammed. 2022. LMFLOSS: A hybrid loss for imbalanced medical image classification. arXiv preprint arXiv:2212.12741 (2022)."},{"key":"e_1_3_2_1_25_1","volume-title":"Audio-guided fusion techniques for multimodal emotion analysis. arXiv preprint arXiv:2409.05007","author":"Shi Pujin","year":"2024","unstructured":"Pujin Shi and Fei Gao. 2024. Audio-guided fusion techniques for multimodal emotion analysis. arXiv preprint arXiv:2409.05007 (2024)."},{"key":"e_1_3_2_1_26_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"volume-title":"Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, et al., 2022","year":"2022","key":"e_1_3_2_1_27_1","unstructured":"BigScience Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, et al., 2022. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100 (2022)."},{"volume-title":"FG-CLIP: Fine-Grained Visual and Textual Alignment. In Forty-second International Conference on Machine Learning.","author":"Xie Chunyu","key":"e_1_3_2_1_28_1","unstructured":"Chunyu Xie, Bin Wang, Fanjing Kong, Jincheng Li, Dawei Liang, Gengshen Zhang, Dawei Leng, and Yuhui Yin. [n.d.]. FG-CLIP: Fine-Grained Visual and Textual Alignment. In Forty-second International Conference on Machine Learning."},{"key":"e_1_3_2_1_29_1","volume-title":"Four Eyes Are Better Than Two: Harnessing the Collaborative Potential of Large Models via Differentiated Thinking and Complementary Ensembles. arXiv preprint arXiv:2505.16784","author":"Xie Jun","year":"2025","unstructured":"Jun Xie, Xiongjun Guan, Yingjian Zhu, Zhaoran Zhao, Xinming Wang, Hongzhu Yi, Feng Chen, and Zhepeng Wang. 2025. Four Eyes Are Better Than Two: Harnessing the Collaborative Potential of Large Models via Differentiated Thinking and Complementary Ensembles. arXiv preprint arXiv:2505.16784 (2025)."},{"key":"e_1_3_2_1_30_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan et al. 2023. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Emotion-anchored contrastive learning framework for emotion recognition in conversation. arXiv preprint arXiv:2403.20289","author":"Yu Fangxu","year":"2024","unstructured":"Fangxu Yu, Junjie Guo, Zhen Wu, and Xinyu Dai. 2024. Emotion-anchored contrastive learning framework for emotion recognition in conversation. arXiv preprint arXiv:2403.20289 (2024)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689092.3689407"}],"event":{"name":"MM '25:The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland"},"container-title":["Proceedings of the 3rd International Workshop on Multimodal and Responsible Affective Computing"],"original-title":[],"deposited":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T17:22:20Z","timestamp":1761153740000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746270.3760215"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,26]]},"references-count":32,"alternative-id":["10.1145\/3746270.3760215","10.1145\/3746270"],"URL":"https:\/\/doi.org\/10.1145\/3746270.3760215","relation":{},"subject":[],"published":{"date-parts":[[2025,10,26]]},"assertion":[{"value":"2025-10-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}