{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T09:15:04Z","timestamp":1780391704593,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3689092.3689401","type":"proceedings-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T18:33:17Z","timestamp":1729708397000},"page":"49-53","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Multimodal Emotion Recognition with Vision-language Prompting and Modality Dropout"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1015-7494","authenticated-orcid":false,"given":"Anbin","family":"Qi","sequence":"first","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1034-6718","authenticated-orcid":false,"given":"Zhongliang","family":"Liu","sequence":"additional","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2774-2106","authenticated-orcid":false,"given":"Xinyong","family":"Zhou","sequence":"additional","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1559-9929","authenticated-orcid":false,"given":"Jinba","family":"Xiao","sequence":"additional","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9488-5077","authenticated-orcid":false,"given":"Fengrun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0688-3920","authenticated-orcid":false,"given":"Qi","family":"Gan","sequence":"additional","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4757-4975","authenticated-orcid":false,"given":"Ming","family":"Tao","sequence":"additional","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3531-532X","authenticated-orcid":false,"given":"Gaozheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8714-3007","authenticated-orcid":false,"given":"Lu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020), 12449--12460."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia. 9556--9560","author":"Chen Haifeng","year":"2023","unstructured":"Haifeng Chen, Chujia Guo, Yan Li, Peng Zhang, and Dongmei Jiang. 2023. Semisupervised multimodal emotion recognition with class-balanced pseudo-labeling. In Proceedings of the 31st ACM International Conference on Multimedia. 9556--9560."},{"key":"e_1_3_2_1_4_1","volume-title":"Electra: Pre-training text encoders as discriminators rather than generators. arXiv preprint arXiv:2003.10555","author":"Clark Kevin","year":"2020","unstructured":"Kevin Clark, Minh-Thang Luong, Quoc V Le, and Christopher D Manning. 2020. Electra: Pre-training text encoders as discriminators rather than generators. arXiv preprint arXiv:2003.10555 (2020)."},{"key":"e_1_3_2_1_5_1","volume-title":"Transformer-xl: Attentive language models beyond a fixedlength context. arXiv preprint arXiv:1901.02860","author":"Dai Zihang","year":"2019","unstructured":"Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V Le, and Ruslan Salakhutdinov. 2019. Transformer-xl: Attentive language models beyond a fixedlength context. arXiv preprint arXiv:1901.02860 (2019)."},{"key":"e_1_3_2_1_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_7_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing 29 (2021), 3451--3460."},{"key":"e_1_3_2_1_8_1","first-page":"21157","article-title":"u-hubert: Unified mixed-modal speech pretraining and zero-shot transfer to unlabeled modality","volume":"35","author":"Hsu Wei-Ning","year":"2022","unstructured":"Wei-Ning Hsu and Bowen Shi. 2022. u-hubert: Unified mixed-modal speech pretraining and zero-shot transfer to unlabeled modality. Advances in Neural Information Processing Systems 35 (2022), 21157--21170.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","volume-title":"International conference on machine learning. PMLR, 9226--9259","author":"Huang Yu","year":"2022","unstructured":"Yu Huang, Junyang Lin, Chang Zhou, Hongxia Yang, and Longbo Huang. 2022. Modality competition: What makes joint training of multi-modal network fail in deep learning?(provably). In International conference on machine learning. PMLR, 9226--9259."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_1_11_1","volume-title":"MER 2024: Semi- Supervised Learning, Noise Robustness, and Open-Vocabulary Multimodal Emotion Recognition. arXiv preprint arXiv:2404","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Zhuofan Wen, Siyuan Zhang, Shun Chen, Hao Gu, Jinming Zhao, Ziyang Ma, Xie Chen, et al. 2024. MER 2024: Semi- Supervised Learning, Noise Robustness, and Open-Vocabulary Multimodal Emotion Recognition. arXiv preprint arXiv:2404.17113 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Merbench: A unified evaluation benchmark for multimodal emotion recognition. arXiv preprint arXiv:2401.03429","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Licai Sun, Yong Ren, Hao Gu, Haiyang Sun, Lan Chen, Bin Liu, and Jianhua Tao. 2024. Merbench: A unified evaluation benchmark for multimodal emotion recognition. arXiv preprint arXiv:2401.03429 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_14_1","volume-title":"Paul Pu Liang, Amir Zadeh, and Louis-Philippe Morency.","author":"Liu Zhun","year":"2018","unstructured":"Zhun Liu, Ying Shen, Varun Bharadhwaj Lakshminarasimhan, Paul Pu Liang, Amir Zadeh, and Louis-Philippe Morency. 2018. Efficient low-rank multimodal fusion with modality-specific factors. arXiv preprint arXiv:1806.00064 (2018)."},{"key":"e_1_3_2_1_15_1","volume-title":"emotion2vec: Self-supervised pre-training for speech emotion representation. arXiv preprint arXiv:2312.15185","author":"Ma Ziyang","year":"2023","unstructured":"Ziyang Ma, Zhisheng Zheng, Jiaxin Ye, Jinchao Li, Zhifu Gao, Shiliang Zhang, and Xie Chen. 2023. emotion2vec: Self-supervised pre-training for speech emotion representation. arXiv preprint arXiv:2312.15185 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"e_1_3_2_1_17_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00633"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the conference. Association for computational linguistics. Meeting","volume":"2019","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the conference. Association for computational linguistics. Meeting, Vol. 2019. NIH Public Access, 6558."},{"key":"e_1_3_2_1_20_1","volume-title":"International conference on machine learning. PMLR, 11525--11536","author":"Xu Yi","year":"2021","unstructured":"Yi Xu, Lei Shang, Jinxing Ye, Qi Qian, Yu-Feng Li, Baigui Sun, Hao Li, and Rong Jin. 2021. Dash: Semi-supervised learning with dynamic thresholding. In International conference on machine learning. PMLR, 11525--11536."},{"key":"e_1_3_2_1_21_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan et al. 2023. Baichuan 2: Open largescale language models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems 32","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Russ R Salakhutdinov, and Quoc V Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_23_1","volume-title":"Tensor fusion network for multimodal sentiment analysis. arXiv preprint arXiv:1707.07250","author":"Zadeh Amir","year":"2017","unstructured":"Amir Zadeh, Minghai Chen, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency. 2017. Tensor fusion network for multimodal sentiment analysis. arXiv preprint arXiv:1707.07250 (2017)."},{"key":"e_1_3_2_1_24_1","first-page":"18408","article-title":"Flexmatch: Boosting semi-supervised learning with curriculum pseudo labeling","volume":"34","author":"Zhang Bowen","year":"2021","unstructured":"Bowen Zhang, Yidong Wang, Wenxin Hou, Hao Wu, Jindong Wang, Manabu Okumura, and Takahiro Shinozaki. 2021. Flexmatch: Boosting semi-supervised learning with curriculum pseudo labeling. Advances in Neural Information Processing Systems 34 (2021), 18408--18419.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612872"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689401","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689092.3689401","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T01:59:32Z","timestamp":1755914372000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689401"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":26,"alternative-id":["10.1145\/3689092.3689401","10.1145\/3689092"],"URL":"https:\/\/doi.org\/10.1145\/3689092.3689401","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}