{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T17:59:23Z","timestamp":1770832763873,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3689092.3689415","type":"proceedings-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T18:33:17Z","timestamp":1729708397000},"page":"54-61","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Early Joint Learning of Emotion Information Makes MultiModal Model Understand You Better"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-2976-4153","authenticated-orcid":false,"given":"Mengying","family":"Ge","sequence":"first","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2082-5013","authenticated-orcid":false,"given":"Mingyang","family":"Li","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8426-6725","authenticated-orcid":false,"given":"Dongkai","family":"Tang","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0563-2854","authenticated-orcid":false,"given":"Pengbo","family":"Li","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5704-7959","authenticated-orcid":false,"given":"Kuo","family":"Liu","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0322-2661","authenticated-orcid":false,"given":"Shuhao","family":"Deng","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5575-263X","authenticated-orcid":false,"given":"Songbai","family":"Pu","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1080-079X","authenticated-orcid":false,"given":"Long","family":"Liu","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8252-9626","authenticated-orcid":false,"given":"Yang","family":"Song","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5226-9429","authenticated-orcid":false,"given":"Tao","family":"Zhang","sequence":"additional","affiliation":[{"name":"BOSS ZhiPin, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/265013"},{"key":"e_1_3_2_1_2_1","volume-title":"Unimse: Towards unified multimodal sentiment analysis and emotion recognition. arXiv preprint arXiv:2211.11256","author":"Hu Guimin","year":"2022","unstructured":"Guimin Hu, Ting-En Lin, Yi Zhao, Guangming Lu, Yuchuan Wu, and Yongbin Li. Unimse: Towards unified multimodal sentiment analysis and emotion recognition. 
arXiv preprint arXiv:2211.11256, 2022."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_2_1_4_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, 35:10078--10093","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, 35:10078--10093, 2022."},{"key":"e_1_3_2_1_5_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. Learning transferable visual models from natural language supervision. In International conference on machine learning, pages 8748--8763. PMLR, 2021."},{"key":"e_1_3_2_1_6_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, 33:12449--12460","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, 33:12449--12460, 2020."},{"key":"e_1_3_2_1_7_1","volume-title":"Hubert: Self-supervised speech representation learning by masked prediction of hidden units","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, 29:3451--3460, 2021."},{"key":"e_1_3_2_1_8_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692, 2019."},{"key":"e_1_3_2_1_9_1","volume-title":"Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305","author":"Yang Aiyuan","year":"2023","unstructured":"Aiyuan Yang, Bin Xiao, Bingning Wang, Borong Zhang, Ce Bian, Chao Yin, Chenxu Lv, Da Pan, Dian Wang, Dong Yan, et al. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305, 2023."},{"key":"e_1_3_2_1_10_1","volume-title":"Jointly fine-tuning? bert-like? self supervised models to improve multimodal speech emotion recognition. arXiv preprint arXiv:2008.06682","author":"Siriwardhana Shamane","year":"2020","unstructured":"Shamane Siriwardhana, Andrew Reis, Rivindu Weerasekera, and Suranga Nanayakkara. Jointly fine-tuning? bert-like? self supervised models to improve multimodal speech emotion recognition. arXiv preprint arXiv:2008.06682, 2020."},{"key":"e_1_3_2_1_11_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Ahuja Chaitanya","year":"2018","unstructured":", Chaitanya Ahuja, and Louis-Philippe Morency. Multimodal machine learning: A survey and taxonomy. 
IEEE transactions on pattern analysis and machine intelligence, 41(2):423--443, 2018."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2022.03.009"},{"key":"e_1_3_2_1_13_1","volume-title":"Merbench: A unified evaluation benchmark for multimodal emotion recognition. arXiv preprint arXiv:2401.03429","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Licai Sun, Yong Ren, Hao Gu, Haiyang Sun, Lan Chen, Bin Liu, and Jianhua Tao. Merbench: A unified evaluation benchmark for multimodal emotion recognition. arXiv preprint arXiv:2401.03429, 2024."},{"key":"e_1_3_2_1_14_1","first-page":"9226","volume-title":"International conference on machine learning","author":"Huang Yu","year":"2022","unstructured":"Yu Huang, Junyang Lin, Chang Zhou, Hongxia Yang, and Longbo Huang. Modality competition: What makes joint training of multi-modal network fail in deep learning?(provably). In International conference on machine learning, pages 9226-- 9259. PMLR, 2022."},{"key":"e_1_3_2_1_15_1","volume-title":"Mer 2024: Semi-supervised learning, noise robustness, and open-vocabulary multimodal emotion recognition. arXiv preprint arXiv:2404.17113","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Zhuofan Wen, Siyuan Zhang, Shun Chen, Hao Gu, Jinming Zhao, Ziyang Ma, Xie Chen, et al. Mer 2024: Semi-supervised learning, noise robustness, and open-vocabulary multimodal emotion recognition. arXiv preprint arXiv:2404.17113, 2024."},{"key":"e_1_3_2_1_16_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929, 2020."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612836"},{"key":"e_1_3_2_1_21_1","volume-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919","author":"Chu Yunfei","year":"2023","unstructured":"Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, and Jingren Zhou. Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919, 2023."},{"key":"e_1_3_2_1_22_1","volume-title":"et al. Funasr: A fundamental end-to-end speech recognition toolkit. arXiv preprint arXiv:2305.11013","author":"Gao Zhifu","year":"2023","unstructured":"Zhifu Gao, Zerui Li, Jiaming Wang, Haoneng Luo, Xian Shi, Mengzhe Chen, Yabin Li, Lingyun Zuo, Zhihao Du, Zhangyu Xiao, et al. Funasr: A fundamental end-to-end speech recognition toolkit. arXiv preprint arXiv:2305.11013, 2023."},{"key":"e_1_3_2_1_23_1","volume-title":"Paraformer: Fast and accurate parallel transformer for non-autoregressive end-to-end speech recognition. arXiv preprint arXiv:2206.08317","author":"Gao Zhifu","year":"2022","unstructured":"Zhifu Gao, Shiliang Zhang, Ian McLoughlin, and Zhijie Yan. 
Paraformer: Fast and accurate parallel transformer for non-autoregressive end-to-end speech recognition. arXiv preprint arXiv:2206.08317, 2022."},{"key":"e_1_3_2_1_24_1","volume-title":"A Music, Speech, and Noise Corpus","author":"Snyder David","year":"2015","unstructured":"David Snyder, Guoguo Chen, and Daniel Povey. MUSAN: A Music, Speech, and Noise Corpus, 2015. arXiv:1510.08484v1."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2019.2917582"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445985"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.425"},{"key":"e_1_3_2_1_28_1","volume-title":"Voxceleb2: Deep speaker recognition. arXiv preprint arXiv:1806.05622","author":"Chung Joon Son","year":"2018","unstructured":"Joon Son Chung, Arsha Nagrani, and Andrew Zisserman. Voxceleb2: Deep speaker recognition. arXiv preprint arXiv:1806.05622, 2018."},{"key":"e_1_3_2_1_29_1","volume-title":"Workshop on faces in?Real-Life?Images: detection, alignment, and recognition","author":"Huang Gary B","year":"2008","unstructured":"Gary B Huang, Marwan Mattar, Tamara Berg, and Eric Learned-Miller. Labeled faces in the wild: A database forstudying face recognition in unconstrained environments. In Workshop on faces in?Real-Life?Images: detection, alignment, and recognition, 2008."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.596"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_32_1","volume-title":"Imdb-wiki--500k face images with age and gender labels. Online] URL: https:\/\/data. vision. ee. ethz. ch\/cvl\/rrothe\/imdb-wiki, 4","author":"Rothe Rasmus","year":"2015","unstructured":"Rasmus Rothe, Radu Timofte, and LV Gool. Imdb-wiki--500k face images with age and gender labels. Online] URL: https:\/\/data. vision. ee. ethz. ch\/cvl\/rrothe\/imdb-wiki, 4, 2015."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12024"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.277"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.414"},{"key":"e_1_3_2_1_37_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization. 
arXiv preprint arXiv:1711.05101, 2017."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689415","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689092.3689415","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T01:59:47Z","timestamp":1755914387000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689415"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":37,"alternative-id":["10.1145\/3689092.3689415","10.1145\/3689092"],"URL":"https:\/\/doi.org\/10.1145\/3689092.3689415","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
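
For reference, the record above uses the response envelope of Crossref's public REST API (the "status" / "message-type" / "message" wrapper). Below is a minimal sketch, using only the Python standard library, of how one might re-fetch and inspect this same record by its DOI; it assumes network access, and every field it reads (message, title, container-title, reference-count) appears verbatim in the JSON above.

# Minimal sketch: fetch this Crossref work record by DOI and print a few fields.
# Assumes network access; api.crossref.org/works/{doi} is Crossref's public works route.
import json
from urllib.request import urlopen

DOI = "10.1145/3689092.3689415"  # DOI field from the record above

with urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    record = json.load(resp)  # same envelope: status, message-type, message

msg = record["message"]
print(msg["title"][0])            # paper title
print(msg["container-title"][0])  # workshop proceedings title
print(msg["reference-count"], "references")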