{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T18:05:49Z","timestamp":1777658749178,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","award":["202307960007, 202406250043"],"award-info":[{"award-number":["202307960007, 202406250043"]}],"id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100019241","name":"CSC \u2013 IT Center for Science","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100019241","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755411","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:38:54Z","timestamp":1761377934000},"page":"5707-5716","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["DEEMO: De-identity Multimodal Emotion Recognition and Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7140-0701","authenticated-orcid":false,"given":"Deng","family":"Li","sequence":"first","affiliation":[{"name":"Lappeenranta-Lahti University of Technology LUT, Lappeenranta, Finland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5924-4178","authenticated-orcid":false,"given":"Bohao","family":"Xing","sequence":"additional","affiliation":[{"name":"Lappeenranta-Lahti University of Technology LUT, Lappeenranta, Finland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2242-6139","authenticated-orcid":false,"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"Lappeenranta-Lahti University of Technology LUT, Lappeenranta, Finland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4933-5801","authenticated-orcid":false,"given":"Baiqiang","family":"Xia","sequence":"additional","affiliation":[{"name":"AMD Silo AI, Helsinki, Finland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6874-6453","authenticated-orcid":false,"given":"Bihan","family":"Wen","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0790-6847","authenticated-orcid":false,"given":"Heikki","family":"K\u00e4lvi\u00e4inen","sequence":"additional","affiliation":[{"name":"Lappeenranta-Lahti University of Technology LUT, Lappeenranta, Finland and Brno University of Technology, Brno, Czech Republic"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_2_2_3_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2396531"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2021.10.005"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01761-6"},{"key":"e_1_3_2_2_9_1","first-page":"110805","article-title":"Emotion-llama: Multimodal emotion recognition and reasoning with instruction tuning","volume":"37","author":"Cheng Zebang","year":"2025","unstructured":"Zebang Cheng, Zhi-Qi Cheng, Jun-Yan He, Kai Wang, Yuxiang Lin, Zheng Lian, Xiaojiang Peng, and Alexander Hauptmann. 2025. Emotion-llama: Multimodal emotion recognition and reasoning with instruction tuning. Advances in Neural Information Processing Systems, Vol. 37 (2025), 110805-110853.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.semeval-1.97"},{"key":"e_1_3_2_2_11_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_2_2_12_1","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin et al. 2024. Qwen2-audio technical report. arXiv preprint arXiv:2407.10759 (2024)."},{"key":"e_1_3_2_2_13_1","unstructured":"Gheorghe Comanici Eric Bieber Mike Schaekermann Ice Pasupat Noveen Sachdeva Inderjit Dhillon Marcel Blistein Ori Ram Dan Zhang Evan Rosen et al. 2025. Gemini 2.5: Pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities. arXiv preprint arXiv:2507.06261 (2025)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2012.26"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818346.2829994"},{"key":"e_1_3_2_2_16_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-74889-2_43"},{"key":"e_1_3_2_2_18_1","volume-title":"Survey on speech emotion recognition: Features, classification schemes, and databases. Pattern recognition","author":"Ayadi Moataz El","year":"2011","unstructured":"Moataz El Ayadi, Mohamed S Kamel, and Fakhri Karray. 2011. Survey on speech emotion recognition: Features, classification schemes, and databases. Pattern recognition, Vol. 44, 3 (2011), 572-587."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101847"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"e_1_3_2_2_21_1","volume-title":"Proceedings of the Ninth International Conference on Language Resources and Evaluation. 3486-3493","author":"Fourati Nesrine","year":"2014","unstructured":"Nesrine Fourati and Catherine Pelachaud. 2014. Emilya: Emotional body expression in daily actions database. In Proceedings of the Ninth International Conference on Language Resources and Evaluation. 3486-3493."},{"key":"e_1_3_2_2_22_1","volume-title":"Identity-free artificial emotional intelligence via micro-gesture understanding. arXiv preprint arXiv:2405.13206","author":"Gao Rong","year":"2024","unstructured":"Rong Gao, Xin Liu, Bohao Xing, Zitong Yu, Bjorn W Schuller, and Heikki K\u00e4lvi\u00e4inen. 2024. Identity-free artificial emotional intelligence via micro-gesture understanding. arXiv preprint arXiv:2405.13206 (2024)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2008.4607572"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3358415"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/COMPTELIX.2017.8004002"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2023.102974"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2781732"},{"key":"e_1_3_2_2_30_1","volume-title":"Feallm: Advancing facial emotion analysis in multimodal large language models with emotional synergy and reasoning. arXiv preprint arXiv:2505.13419","author":"Hu Zhuozhao","year":"2025","unstructured":"Zhuozhao Hu, Kaishen Yuan, Xin Liu, Zitong Yu, Yuan Zong, Jingang Shi, Huanjing Yue, and Jingyu Yang. 2025. Feallm: Advancing facial emotion analysis in multimodal large language models with emotional synergy and reasoning. arXiv preprint arXiv:2505.13419 (2025)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/T-AFFC.2011.15"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2944808"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1038\/s44159-023-00172-1"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2024.3396656"},{"key":"e_1_3_2_2_36_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2020.2981446"},{"key":"e_1_3_2_2_38_1","volume-title":"UMETTS: A Unified Framework for Emotional Text-to-Speech Synthesis with Multimodal Prompts. In IEEE International Conference on Acoustics, Speech and Signal Processing. 1-5.","author":"Li Xiang","unstructured":"Xiang Li, Zhi-Qi Cheng, Jun-Yan He, Junyao Chen, Xiaomao Fan, Xiaojiang Peng, and Alexander G. Hauptmann. 2025. UMETTS: A Unified Framework for Emotional Text-to-Speech Synthesis with Multimodal Prompts. In IEEE International Conference on Acoustics, Speech and Signal Processing. 1-5."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524499"},{"key":"e_1_3_2_2_40_1","unstructured":"Zheng Lian Haiyang Sun Licai Sun Hao Gu Zhuofan Wen Siyuan Zhang Shun Chen Mingyu Xu Ke Xu Kang Chen et al. 2023. Explainable multimodal emotion recognition. arXiv preprint arXiv:2306.15401 (2023)."},{"key":"e_1_3_2_2_41_1","volume-title":"AffectGPT: Dataset and framework for explainable multimodal emotion recognition. arXiv preprint arXiv:2407.07653","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Jiangyan Yi, Bin Liu, and Jianhua Tao. 2024. AffectGPT: Dataset and framework for explainable multimodal emotion recognition. arXiv preprint arXiv:2407.07653 (2024)."},{"key":"e_1_3_2_2_42_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3573785"},{"key":"e_1_3_2_2_44_1","volume-title":"Visual instruction tuning. Advances in Neural Information Processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in Neural Information Processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSCSE.2016.0051"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2021.3071170"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01049"},{"key":"e_1_3_2_2_48_1","volume-title":"Multi-scale promoted self-adjusting correlation learning for facial action unit detection","author":"Liu Xin","year":"2024","unstructured":"Xin Liu, Kaishen Yuan, Xuesong Niu, Jingang Shi, Zitong Yu, Huanjing Yue, and Jingyu Yang. 2024. Multi-scale promoted self-adjusting correlation learning for facial action unit detection. IEEE Transactions on Affective Computing (2024)."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_2_2_50_1","unstructured":"Stephen Edward McAdams. 1984. Spectral fusion spectral parsing and the formation of auditory images. Stanford university."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2013.130"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2021.3096024"},{"key":"e_1_3_2_2_53_1","volume-title":"Deep learning based multimodal emotion recognition using model-level fusion of audio-visual modalities. Knowledge-based systems","author":"Middya Asif Iqbal","year":"2022","unstructured":"Asif Iqbal Middya, Baibhav Nag, and Sarbani Roy. 2022. Deep learning based multimodal emotion recognition using model-level fusion of audio-visual modalities. Knowledge-based systems, Vol. 244 (2022), 108580."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/2070481.2070509"},{"key":"e_1_3_2_2_55_1","volume-title":"A review on sentiment analysis and emotion detection from text. Social network analysis and mining","author":"Nandwani Pansy","year":"2021","unstructured":"Pansy Nandwani and Rupali Verma. 2021. A review on sentiment analysis and emotion detection from text. Social network analysis and mining, Vol. 11, 1 (2021), 81."},{"key":"e_1_3_2_2_56_1","volume-title":"What every body is saying","author":"Navarro Joe","unstructured":"Joe Navarro and Marvin Karlins. 2008. What every body is saying. HarperCollins Publishers New York, NY, USA."},{"key":"e_1_3_2_2_57_1","volume-title":"Shi Qiu, Muhammad Saqib, Saeed Anwar, Muhammad Usman, Naveed Akhtar, Nick Barnes, and Ajmal Mian.","author":"Naveed Humza","year":"2023","unstructured":"Humza Naveed, Asad Ullah Khan, Shi Qiu, Muhammad Saqib, Saeed Anwar, Muhammad Usman, Naveed Akhtar, Nick Barnes, and Ajmal Mian. 2023. A comprehensive overview of large language models. arXiv preprint arXiv:2307.06435 (2023)."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2018.2874986"},{"key":"e_1_3_2_2_59_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning. Article 1182","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In Proceedings of the 40th International Conference on Machine Learning. Article 1182, 27 pages."},{"key":"e_1_3_2_2_60_1","first-page":"171","article-title":"A multimodal emotion recognition system using facial landmark analysis. Iranian Journal of Science and Technology","volume":"43","author":"Rahdari Farhad","year":"2019","unstructured":"Farhad Rahdari, Esmat Rashedi, and Mahdi Eftekhari. 2019. A multimodal emotion recognition system using facial landmark analysis. Iranian Journal of Science and Technology, Transactions of Electrical Engineering, Vol. 43 (2019), 171-189.","journal-title":"Transactions of Electrical Engineering"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2024.103102"},{"key":"e_1_3_2_2_62_1","first-page":"1","article-title":"General data protection regulation","volume":"25","author":"Regulation Protection","year":"2018","unstructured":"Protection Regulation. 2018. General data protection regulation. Intouch, Vol. 25 (2018), 1-5.","journal-title":"Intouch"},{"key":"e_1_3_2_2_63_1","volume-title":"On the use of speech and face information for identity verification. Research Paper IDIAP-RR","author":"Sanderson Conrad","year":"2004","unstructured":"Conrad Sanderson and Kuldip K Paliwal. 2004. On the use of speech and face information for identity verification. Research Paper IDIAP-RR (2004), 04-10."},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.04.028"},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_2_2_66_1","volume-title":"Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)."},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3583968"},{"key":"e_1_3_2_2_68_1","volume-title":"The voiceprivacy 2024 challenge evaluation plan. arXiv preprint arXiv:2404.02677","author":"Tomashenko Natalia","year":"2024","unstructured":"Natalia Tomashenko, Xiaoxiao Miao, Pierre Champion, Sarina Meyer, Xin Wang, Emmanuel Vincent, Michele Panariello, Nicholas Evans, Junichi Yamagishi, and Massimiliano Todisco. 2024. The voiceprivacy 2024 challenge evaluation plan. arXiv preprint arXiv:2404.02677 (2024)."},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.121419"},{"key":"e_1_3_2_2_70_1","volume-title":"Emo-llama: Enhancing facial emotion understanding with instruction tuning. arXiv preprint arXiv:2408.11424","author":"Xing Bohao","year":"2024","unstructured":"Bohao Xing, Zitong Yu, Xin Liu, Kaishen Yuan, Qilang Ye, Weicheng Xie, Huanjing Yue, Jingyu Yang, and Heikki K\u00e4lvi\u00e4inen. 2024. Emo-llama: Enhancing facial emotion understanding with instruction tuning. arXiv preprint arXiv:2408.11424 (2024)."},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1155\/2020\/7845384"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2025.3563775"},{"key":"e_1_3_2_2_73_1","volume-title":"Learning Adaptive Lighting via Channel-Aware Guidance. arXiv preprint arXiv:2412.01493","author":"Yang Qirui","year":"2024","unstructured":"Qirui Yang, Peng-Tao Jiang, Hao Zhang, Jinwei Chen, Bo Li, Huanjing Yue, and Jingyu Yang. 2024. Learning Adaptive Lighting via Channel-Aware Guidance. arXiv preprint arXiv:2412.01493 (2024)."},{"key":"e_1_3_2_2_74_1","volume-title":"Efficient hdr reconstruction from real-world raw images. arXiv preprint arXiv:2306.10311","author":"Yang Qirui","year":"2023","unstructured":"Qirui Yang, Yihao Liu, Qihua Chen, Huanjing Yue, Kun Li, and Jingyu Yang. 2023b. Efficient hdr reconstruction from real-world raw images. arXiv preprint arXiv:2306.10311 (2023)."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2023.102549"},{"key":"e_1_3_2_2_76_1","volume-title":"DSDNet: Raw Domain Demoir\\'eing via Dual Color-Space Synergy. arXiv preprint arXiv:2504.15756","author":"Yang Qirui","year":"2025","unstructured":"Qirui Yang, Fangpu Zhang, Yeying Jin, Qihua Cheng, Pengtao Jiang, Huanjing Yue, and Jingyu Yang. 2025b. DSDNet: Raw Domain Demoir\\'eing via Dual Color-Space Synergy. arXiv preprint arXiv:2504.15756 (2025)."},{"key":"e_1_3_2_2_77_1","volume-title":"CAT: Investigating and Enhancing Audio-visual Understanding in Large Language Models","author":"Ye Qilang","year":"2025","unstructured":"Qilang Ye, Zitong Yu, Rui Shao, Yawen Cui, Xiangui Kang, Xin Liu, Philip Torr, and Xiaochun Cao. 2025. CAT: Investigating and Enhancing Audio-visual Understanding in Large Language Models. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025)."},{"key":"e_1_3_2_2_78_1","first-page":"427","article-title":"AUFormer","volume":"2024","author":"Yuan Kaishen","year":"2025","unstructured":"Kaishen Yuan, Zitong Yu, Xin Liu, Weicheng Xie, Huanjing Yue, and Jingyu Yang. 2025. AUFormer: Vision Transformers Are Parameter-Efficient Facial Action Unit Detectors. In Computer Vision - ECCV 2024. 427-445.","journal-title":"Vision Transformers Are Parameter-Efficient Facial Action Unit Detectors. In Computer Vision - ECCV"},{"key":"e_1_3_2_2_79_1","volume-title":"Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. In 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","author":"Zhang Hang","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. In 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Singapore, 543-553."},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02722"},{"key":"e_1_3_2_2_81_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755411","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:10:40Z","timestamp":1765339840000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755411"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":81,"alternative-id":["10.1145\/3746027.3755411","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755411","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}