{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T03:30:29Z","timestamp":1769830229130,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Natural Science Foundation of Top Talent at SZTU","award":["GDRC202131"],"award-info":[{"award-number":["GDRC202131"]}]},{"name":"NSF CISE","award":["1937998"],"award-info":[{"award-number":["1937998"]}]},{"name":"the National Natural Science Foundation of China","award":["62176165"],"award-info":[{"award-number":["62176165"]}]},{"name":"the Stable Support Projects for Shenzhen Higher Education Institutions","award":["20220718110918001"],"award-info":[{"award-number":["20220718110918001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3689092.3689404","type":"proceedings-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T18:33:17Z","timestamp":1729708397000},"page":"78-87","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for Multimodal Emotion Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2854-7425","authenticated-orcid":false,"given":"Zebang","family":"Cheng","sequence":"first","affiliation":[{"name":"Shenzhen Technology University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6369-535X","authenticated-orcid":false,"given":"Shuyuan","family":"Tu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7947-1773","authenticated-orcid":false,"given":"Dawei","family":"Huang","sequence":"additional","affiliation":[{"name":"Shenzhen Technology University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5153-2406","authenticated-orcid":false,"given":"Minghan","family":"Li","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5783-321X","authenticated-orcid":false,"given":"Xiaojiang","family":"Peng","sequence":"additional","affiliation":[{"name":"Shenzhen Technology University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1720-2085","authenticated-orcid":false,"given":"Zhi-Qi","family":"Cheng","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2123-0684","authenticated-orcid":false,"given":"Alexander G.","family":"Hauptmann","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.3390\/s23115184"},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. 
Advances in neural information processing systems Vol. 35 (2022) 23716--23736."},{"key":"e_1_3_2_1_3_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023. Qwen Technical Report. arxiv: 2309.16609 [cs.CL]"},{"key":"e_1_3_2_1_4_1","volume-title":"Language Models are Few-Shot Learners. arxiv","author":"Brown Tom B.","year":"2005","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. arxiv: 2005.14165 [cs.CL] https:\/\/arxiv.org\/abs\/2005.14165"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 357--366","author":"Richard Chen Chun-Fu","year":"2021","unstructured":"Chun-Fu Richard Chen, Quanfu Fan, and Rameswar Panda. 2021. Crossvit: Cross-attention multi-scale vision transformer for image classification. In Proceedings of the IEEE\/CVF international conference on computer vision. 357--366."},{"key":"e_1_3_2_1_6_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Emotion-LLaMA: Multimodal Emotion Recognition and Reasoning with Instruction Tuning. arXiv preprint arXiv:2406.11161","author":"Cheng Zebang","year":"2024","unstructured":"Zebang Cheng, Zhi-Qi Cheng, Jun-Yan He, Jingdong Sun, Kai Wang, Yuxiang Lin, Zheng Lian, Xiaojiang Peng, and Alexander Hauptmann. 2024. Emotion-LLaMA: Multimodal Emotion Recognition and Reasoning with Instruction Tuning. arXiv preprint arXiv:2406.11161 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612840"},{"key":"e_1_3_2_1_9_1","volume-title":"Mips at semeval-2024 task 3: Multimodal emotion-cause pair extraction in conversations with multimodal language models. arXiv preprint arXiv:2404.00511","author":"Cheng Zebang","year":"2024","unstructured":"Zebang Cheng, Fuqiang Niu, Yuxiang Lin, Zhi-Qi Cheng, Bowen Zhang, and Xiaojiang Peng. 2024. Mips at semeval-2024 task 3: Multimodal emotion-cause pair extraction in conversations with multimodal language models. arXiv preprint arXiv:2404.00511 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. 
arXiv preprint arXiv:2311.07919","author":"Chu Yunfei","year":"2023","unstructured":"Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, and Jingren Zhou. 2023. Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919 (2023)."},{"key":"e_1_3_2_1_11_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.58"},{"key":"e_1_3_2_1_13_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv: 1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Chaoyue Ding Jiakui Li Daoming Zong Baoxiang Li TianHao Zhang and Qunyan Zhou. 2023. Stable Speech Emotion Recognition with Head-k-Pooling Loss. In INTERSPEECH.","DOI":"10.21437\/Interspeech.2023-80"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3607865.3613184"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747295"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2457417"},{"key":"e_1_3_2_1_19_1","unstructured":"Kaiming He Xinlei Chen Saining Xie Yanghao Li Piotr Doll\u00e1r and Ross Girshick. 2021. Masked Autoencoders Are Scalable Vision Learners. arxiv: 2111.06377 [cs.CV] https:\/\/arxiv.org\/abs\/2111.06377"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.3390\/app14125068"},{"key":"e_1_3_2_1_24_1","volume-title":"Ramakanth Pasunuru, Todor Mihaylov, D\u00e1niel Simig, Ping Yu, Kurt Shuster, Tianlu Wang, Qing Liu, Punit Singh Koura, et al.","author":"Iyer Srinivasan","year":"2022","unstructured":"Srinivasan Iyer, Xi Victoria Lin, Ramakanth Pasunuru, Todor Mihaylov, D\u00e1niel Simig, Ping Yu, Kurt Shuster, Tianlu Wang, Qing Liu, Punit Singh Koura, et al. 2022. OPT-IML: Scaling Language Model Instruction Meta Learning through the Lens of Generalization. arXiv preprint arXiv:2212.12017 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413620"},{"key":"e_1_3_2_1_26_1","volume-title":"Instructerc: Reforming emotion recognition in conversation with a retrieval multi-task llms framework. arXiv preprint arXiv:2309.11911","author":"Lei Shanglin","year":"2023","unstructured":"Shanglin Lei, Guanting Dong, Xiaoping Wang, Keheng Wang, and Sirui Wang. 2023. Instructerc: Reforming emotion recognition in conversation with a retrieval multi-task llms framework. arXiv preprint arXiv:2309.11911 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Otter: A Multi-Modal Model with In-Context Instruction Tuning. 
arXiv preprint arXiv:2305.03726","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Jingkang Yang, and Ziwei Liu. 2023. Otter: A Multi-Modal Model with In-Context Instruction Tuning. arXiv preprint arXiv:2305.03726 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_2_1_30_1","volume-title":"Uni-MoE: Scaling Unified Multimodal LLMs with Mixture of Experts. arXiv preprint arXiv:2405.11273","author":"Li Yunxin","year":"2024","unstructured":"Yunxin Li, Shenyuan Jiang, Baotian Hu, Longyue Wang, Wanqi Zhong, Wenhan Luo, Lin Ma, and Min Zhang. 2024. Uni-MoE: Scaling Unified Multimodal LLMs with Mixture of Experts. arXiv preprint arXiv:2405.11273 (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612836"},{"key":"e_1_3_2_1_32_1","volume-title":"MER 2024: Semi-Supervised Learning, Noise Robustness, and Open-Vocabulary Multimodal Emotion Recognition. arXiv preprint arXiv:2404","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Zhuofan Wen, Siyuan Zhang, Shun Chen, Hao Gu, Jinming Zhao, Ziyang Ma, Xie Chen, et al. 2024. MER 2024: Semi-Supervised Learning, Noise Robustness, and Open-Vocabulary Multimodal Emotion Recognition. arXiv preprint arXiv:2404.17113 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"2024 d. AffectGPT: Dataset and Framework for Explainable Multimodal Emotion Recognition. arXiv preprint arXiv:2407.07653","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Jiangyan Yi, Bin Liu, and Jianhua Tao. 2024 d. AffectGPT: Dataset and Framework for Explainable Multimodal Emotion Recognition. arXiv preprint arXiv:2407.07653 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Merbench: A unified evaluation benchmark for multimodal emotion recognition. arXiv preprint arXiv:2401.03429","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Licai Sun, Yong Ren, Hao Gu, Haiyang Sun, Lan Chen, Bin Liu, and Jianhua Tao. 2024. Merbench: A unified evaluation benchmark for multimodal emotion recognition. arXiv preprint arXiv:2401.03429 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"GPT-4V with emotion: A zero-shot benchmark for Generalized Emotion Recognition. Information Fusion","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Licai Sun, Haiyang Sun, Kang Chen, Zhuofan Wen, Hao Gu, Bin Liu, and Jianhua Tao. 2024. GPT-4V with emotion: A zero-shot benchmark for Generalized Emotion Recognition. Information Fusion (2024), 102367."},{"key":"e_1_3_2_1_36_1","volume-title":"Explainable multimodal emotion reasoning. arXiv preprint arXiv:2306.15401","author":"Lian Zheng","year":"2023","unstructured":"Zheng Lian, Licai Sun, Mingyu Xu, Haiyang Sun, Ke Xu, Zhuofan Wen, Shun Chen, Bin Liu, and Jianhua Tao. 2023. Explainable multimodal emotion reasoning. arXiv preprint arXiv:2306.15401 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Luc Van Gool, and Radu Timofte","author":"Liang Jingyun","year":"2021","unstructured":"Jingyun Liang, Guolei Sun, Kai Zhang, Luc Van Gool, and Radu Timofte. 2021. 
Mutual Affine Network for Spatially Variant Kernel Estimation in Blind Image Super-Resolution. arxiv: 2108.05302 [cs.CV] https:\/\/arxiv.org\/abs\/2108.05302"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"e_1_3_2_1_39_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_40_1","volume-title":"Valley: Video assistant with large language model enhanced ability. arXiv preprint arXiv:2306.07207","author":"Luo Ruipu","year":"2023","unstructured":"Ruipu Luo, Ziwang Zhao, Min Yang, Junwei Dong, Da Li, Pengcheng Lu, Tao Wang, Linmei Hu, Minghui Qiu, and Zhongyu Wei. 2023. Valley: Video assistant with large language model enhanced ability. arXiv preprint arXiv:2306.07207 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00258"},{"key":"e_1_3_2_1_42_1","volume-title":"emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation. arXiv preprint arXiv:2312.15185","author":"Ma Ziyang","year":"2023","unstructured":"Ziyang Ma, Zhisheng Zheng, Jiaxin Ye, Jinchao Li, Zhifu Gao, Shiliang Zhang, and Xie Chen. 2023. emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation. arXiv preprint arXiv:2312.15185 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv preprint arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv preprint arXiv:2306.05424 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Attention bottlenecks for multimodal fusion. Advances in neural information processing systems","author":"Nagrani Arsha","year":"2021","unstructured":"Arsha Nagrani, Shan Yang, Anurag Arnab, Aren Jansen, Cordelia Schmid, and Chen Sun. 2021. Attention bottlenecks for multimodal fusion. Advances in neural information processing systems, Vol. 34 (2021), 14200--14213."},{"key":"e_1_3_2_1_45_1","unstructured":"OpenAI. 2023. GPT-4V(ision) system card. https:\/\/openai.com\/research\/gpt-4v-system-card"},{"key":"e_1_3_2_1_46_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems (2022)."},{"key":"e_1_3_2_1_47_1","volume-title":"Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CIS.2008.204"},{"key":"e_1_3_2_1_49_1","volume-title":"Meld: A multimodal multi-party dataset for emotion recognition in conversations. 
arXiv preprint arXiv:1810.02508","author":"Poria Soujanya","year":"2018","unstructured":"Soujanya Poria, Devamanyu Hazarika, Navonil Majumder, Gautam Naik, Erik Cambria, and Rada Mihalcea. 2018. Meld: A multimodal multi-party dataset for emotion recognition in conversations. arXiv preprint arXiv:1810.02508 (2018)."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_51_1","volume-title":"Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever.","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. arxiv: 2212.04356 [eess.AS] https:\/\/arxiv.org\/abs\/2212.04356"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the 1st Workshop on Taming Large Language Models: Controllability in the era of Interactive Assistants. 11--23","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. PandaGPT: One Model To Instruction-Follow Them All. In Proceedings of the 1st Workshop on Taming Large Language Models: Controllability in the era of Interactive Assistants. 11--23."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2023.3274829"},{"key":"e_1_3_2_1_54_1","volume-title":"SALMONN: Towards Generic Hearing Abilities for Large Language Models. In The Twelfth International Conference on Learning Representations.","author":"Tang Changli","year":"2023","unstructured":"Changli Tang, Wenyi Yu, Guangzhi Sun, Xianzhao Chen, Tian Tan, Wei Li, Lu Lu, MA Zejun, and Chao Zhang. 2023. SALMONN: Towards Generic Hearing Abilities for Large Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_55_1","unstructured":"Qwen Team. 2024. Introducing Qwen1.5. https:\/\/qwenlm.github.io\/blog\/qwen1.5\/"},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems.","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training. In Proceedings of the Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_57_1","volume-title":"Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. arXiv preprint arXiv:2305.11175","author":"Wang Wenhai","year":"2023","unstructured":"Wenhai Wang, Zhe Chen, Xiaokang Chen, Jiannan Wu, Xizhou Zhu, Gang Zeng, Ping Luo, Tong Lu, Jie Zhou, Yu Qiao, et al. 2023. Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. arXiv preprint arXiv:2305.11175 (2023)."},{"key":"e_1_3_2_1_58_1","volume-title":"Self-Instruct: Aligning Language Model with Self Generated Instructions. arXiv preprint arXiv:2212.10560","author":"Wang Yizhong","year":"2022","unstructured":"Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A Smith, Daniel Khashabi, and Hannaneh Hajishirzi. 2022. 
Self-Instruct: Aligning Language Model with Self Generated Instructions. arXiv preprint arXiv:2212.10560 (2022)."},{"key":"e_1_3_2_1_59_1","volume-title":"Atharva Naik, David Stap, et al.","author":"Wang Yizhong","year":"2022","unstructured":"Yizhong Wang, Swaroop Mishra, Pegah Alipoormolabashi, Yeganeh Kordi, Amirreza Mirzaei, Anjana Arunkumar, Arjun Ashok, Arut Selvan Dhanasekaran, Atharva Naik, David Stap, et al. 2022. Benchmarking generalization via in-context instructions on 1,600 language tasks. arXiv preprint arXiv:2204.07705 (2022)."},{"key":"e_1_3_2_1_60_1","volume-title":"Chi, Quoc Le, and Denny Zhou","author":"Wei Jason","year":"2023","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed Chi, Quoc Le, and Denny Zhou. 2023. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. arxiv: 2201.11903 [cs.CL] https:\/\/arxiv.org\/abs\/2201.11903"},{"key":"e_1_3_2_1_61_1","volume-title":"Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili?, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Franccois Yvon, et al. 2022","year":"2022","unstructured":"BigScience Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili?, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Franccois Yvon, et al. 2022. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100 (2022)."},{"key":"e_1_3_2_1_62_1","volume-title":"EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning. arXiv preprint arXiv:2404.16670","author":"Xie Hongxia","year":"2024","unstructured":"Hongxia Xie, Chu-Jun Peng, Yu-Wen Tseng, Hung-Jen Chen, Chan-Feng Hsu, Hong-Han Shuai, and Wen-Huang Cheng. 2024. EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning. arXiv preprint arXiv:2404.16670 (2024)."},{"key":"e_1_3_2_1_63_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan et al. 2023. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_64_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi Chaoya Jiang Chenliang Li Yuanhong Xu Hehong Chen Junfeng Tian Qian Qi Ji Zhang and Fei Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arxiv: 2304.14178 [cs.CL]"},{"key":"e_1_3_2_1_65_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01821"},{"key":"e_1_3_2_1_67_1","volume-title":"Prompting visual-language models for dynamic facial expression recognition. arXiv preprint arXiv:2308.13382","author":"Zhao Zengqun","year":"2023","unstructured":"Zengqun Zhao and Ioannis Patras. 2023. Prompting visual-language models for dynamic facial expression recognition. 
arXiv preprint arXiv:2308.13382 (2023)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340555.3355713"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611974"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689404","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689092.3689404","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T01:59:59Z","timestamp":1755914399000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689404"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":69,"alternative-id":["10.1145\/3689092.3689404","10.1145\/3689092"],"URL":"https:\/\/doi.org\/10.1145\/3689092.3689404","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
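For anyone who wants to work with this record programmatically, here is a minimal sketch of fetching and reading it. It assumes the public Crossref REST API endpoint `https://api.crossref.org/works/{DOI}` and the third-party `requests` package; the DOI comes from the record above, and the helper name `fetch_crossref_work` is purely illustrative.

```python
import requests

# DOI taken from the Crossref record above.
DOI = "10.1145/3689092.3689404"


def fetch_crossref_work(doi: str) -> dict:
    """Fetch a Crossref work record and return its 'message' object.

    The response envelope matches the record above:
    {"status": "ok", "message-type": "work", ..., "message": {...}}.
    """
    resp = requests.get(f"https://api.crossref.org/works/{doi}", timeout=30)
    resp.raise_for_status()
    payload = resp.json()
    if payload.get("status") != "ok":
        raise RuntimeError(f"Unexpected Crossref status: {payload.get('status')}")
    return payload["message"]


if __name__ == "__main__":
    work = fetch_crossref_work(DOI)
    # "title" is a list of strings; "author" entries carry "given"/"family" names.
    print(work["title"][0])
    print(", ".join(f'{a["given"]} {a["family"]}' for a in work.get("author", [])))
    print("References:", work.get("references-count"))
```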