{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:03Z","timestamp":1765343043365,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"name":"the National Key R&D Program of China","award":["2022YFF0901800"],"award-info":[{"award-number":["2022YFF0901800"]}]},{"name":"the National Natural Science Foundation of China (NSFC)","award":["62176205, 62472346, 62372365, 62302383"],"award-info":[{"award-number":["62176205, 62472346, 62372365, 62302383"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755637","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"5765-5774","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LES-CLIP: A Lightweight Emotion-Sensitive Adaptation of CLIP for Precise Similar Emotion Discrimination"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-6601-5290","authenticated-orcid":false,"given":"Xiao","family":"Fu","sequence":"first","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2485-8311","authenticated-orcid":false,"given":"Pengyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, xi'an, shannxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9348-2982","authenticated-orcid":false,"given":"Wei","family":"Xi","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2326-4343","authenticated-orcid":false,"given":"Kun","family":"Zhao","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6276-6182","authenticated-orcid":false,"given":"Jiadong","family":"Feng","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1037-9619","authenticated-orcid":false,"given":"Jizhong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00150"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550325"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680827"},{"key":"e_1_3_2_1_5_1","volume-title":"From static to dynamic: Adapting landmark-aware image models for facial expression recognition in videos","author":"Chen Yin","year":"2024","unstructured":"Yin Chen, Jia Li, Shiguang Shan, Meng Wang, and Richang Hong. 2024b. From static to dynamic: Adapting landmark-aware image models for facial expression recognition in videos. IEEE Transactions on Affective Computing (2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3518"},{"key":"e_1_3_2_1_7_1","volume-title":"Correlation-aware cross-modal attention network for fashion compatibility modeling in ugc systems. ACM Transactions on Multimedia Computing, Communications and Applications","author":"Cui Kai","year":"2024","unstructured":"Kai Cui, Shenghao Liu, Wei Feng, Xianjun Deng, Liangbin Gao, Minmin Cheng, Hongwei Lu, and Laurence T Yang. 2024. Correlation-aware cross-modal attention network for fashion compatibility modeling in ugc systems. ACM Transactions on Multimedia Computing, Communications and Applications (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"Enhancing Multimodal Affective Analysis with Learned Live Comment Features. arXiv preprint arXiv:2410.16407","author":"Deng Zhaoyuan","year":"2024","unstructured":"Zhaoyuan Deng, Amith Ananthram, and Kathleen McKeown. 2024. Enhancing Multimodal Affective Analysis with Learned Live Comment Features. arXiv preprint arXiv:2410.16407 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"SentiFormer: Metadata Enhanced Transformer for Image Sentiment Analysis. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5.","author":"Feng Bin","year":"2025","unstructured":"Bin Feng, Shulan Ruan, Mingzheng Yang, Dongxuan Han, Huijie Liu, Kai Zhang, and Qi Liu. 2025. SentiFormer: Metadata Enhanced Transformer for Image Sentiment Analysis. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG59268.2024.10581982"},{"key":"e_1_3_2_1_11_1","volume-title":"MRFER: Multi-Channel Robust Feature Enhanced Fusion for Multi-Modal Emotion Recognition. In 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1-6.","author":"Fu Xiao","year":"2024","unstructured":"Xiao Fu, Wei Xi, Zhao Yang, Rui Jiang, Dianwen Ng, Jie Yang, and Jizhong Zhao. 2024. MRFER: Multi-Channel Robust Feature Enhanced Fusion for Multi-Modal Emotion Recognition. In 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1-6."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00203"},{"key":"e_1_3_2_1_13_1","first-page":"117","volume-title":"ICONIP 2013, daegu, korea, november 3-7, 2013. Proceedings, Part III 20","author":"Goodfellow Ian J","year":"2013","unstructured":"Ian J Goodfellow, Dumitru Erhan, Pierre Luc Carrier, Aaron Courville, Mehdi Mirza, Ben Hamner, Will Cukierski, Yichuan Tang, David Thaler, Dong-Hyun Lee, et al., 2013. Challenges in representation learning: A report on three machine learning contests. In Neural information processing: 20th international conference, ICONIP 2013, daegu, korea, november 3-7, 2013. Proceedings, Part III 20. Springer, 117-124."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413620"},{"key":"e_1_3_2_1_15_1","volume-title":"Navigating label ambiguity for facial expression recognition in the wild. arXiv preprint arXiv:2502.09993","author":"Lee JunGyu","year":"2025","unstructured":"JunGyu Lee, Yeji Choi, Haksub Kim, Ig-Jae Kim, and Gi Pyo Nam. 2025. Navigating label ambiguity for facial expression recognition in the wild. arXiv preprint arXiv:2502.09993 (2025)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25077"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME57554.2024.10687508"},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023a. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681708"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.277"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3625687.3625799"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_24_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3638065"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM58522.2023.00052"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681583"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00937"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_38"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2740923"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00481"},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_35_1","volume-title":"PE-CLIP: A Parameter-Efficient Fine-Tuning of Vision Language Models for Dynamic Facial Expression Recognition. arXiv preprint arXiv:2503.16945","author":"Saadi Ibtissam","year":"2025","unstructured":"Ibtissam Saadi, Abdenour Hadid, Douglas W Cunningham, Abdelmalik Taleb-Ahmed, and Yassin El Hillali. 2025. PE-CLIP: A Parameter-Efficient Fine-Tuning of Vision Language Models for Dynamic Facial Expression Recognition. arXiv preprint arXiv:2503.16945 (2025)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612365"},{"key":"e_1_3_2_1_37_1","volume-title":"QCS: Feature refining from quadruplet cross similarity for facial expression recognition. arXiv preprint arXiv:2411.01988","author":"Wang Chengpeng","year":"2024","unstructured":"Chengpeng Wang, Li Chen, Lili Wang, Zhaofan Li, and Xuebin Lv. 2024a. QCS: Feature refining from quadruplet cross similarity for facial expression recognition. arXiv preprint arXiv:2411.01988 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01722"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25353"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2024.3398004"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680875"},{"key":"e_1_3_2_1_42_1","volume-title":"Enriching Multimodal Sentiment Analysis through Textual Emotional Descriptions of Visual-Audio Content. arXiv preprint arXiv:2412.10460","author":"Wu Sheng","year":"2024","unstructured":"Sheng Wu, Xiaobao Wang, Longbiao Wang, Dongxiao He, and Jianwu Dang. 2024a. Enriching Multimodal Sentiment Analysis through Textual Emotional Descriptions of Visual-Audio Content. arXiv preprint arXiv:2412.10460 (2024)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN60899.2024.10650158"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00608"},{"key":"e_1_3_2_1_45_1","volume-title":"Omni-Emotion: Extending Video MLLM with Detailed Face and Audio Modeling for Multimodal Emotion Analysis. arXiv preprint arXiv:2501.09502","author":"Yang Qize","year":"2025","unstructured":"Qize Yang, Detao Bai, Yi-Xing Peng, and Xihan Wei. 2025. Omni-Emotion: Extending Video MLLM with Detailed Face and Audio Modeling for Multimodal Emotion Analysis. arXiv preprint arXiv:2501.09502 (2025)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2018.8485912"},{"key":"e_1_3_2_1_47_1","volume-title":"CLIP-Guided Bidirectional Prompt and Semantic Supervision for Dynamic Facial Expression Recognition. In 2024 IEEE International Joint Conference on Biometrics (IJCB). IEEE, 1-10","author":"Zhang Junliang","year":"2024","unstructured":"Junliang Zhang, Xu Liu, Yu Liang, Xiaole Xian, Weicheng Xie, Linlin Shen, and Siyang Song. 2024. CLIP-Guided Bidirectional Prompt and Semantic Supervision for Dynamic Facial Expression Recognition. In 2024 IEEE International Joint Conference on Biometrics (IJCB). IEEE, 1-10."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01821"},{"key":"e_1_3_2_1_49_1","volume-title":"FaVChat: Unlocking Fine-Grained Facail Video Understanding with Multimodal Large Language Models. arXiv preprint arXiv:2503.09158","author":"Zhao Fufangchen","year":"2025","unstructured":"Fufangchen Zhao, Ming Li, Linrui Xu, Wenhao Jiang, Jian Gao, and Danfeng Yan. 2025. FaVChat: Unlocking Fine-Grained Facail Video Understanding with Multimodal Large Language Models. arXiv preprint arXiv:2503.09158 (2025)."},{"key":"e_1_3_2_1_50_1","volume-title":"Prompting visual-language models for dynamic facial expression recognition. arXiv preprint arXiv:2308.13382","author":"Zhao Zengqun","year":"2023","unstructured":"Zengqun Zhao and Ioannis Patras. 2023. Prompting visual-language models for dynamic facial expression recognition. arXiv preprint arXiv:2308.13382 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28594"},{"key":"e_1_3_2_1_52_1","volume-title":"Ceprompt: Cross-modal emotion-aware prompting for facial expression recognition","author":"Zhou Haoliang","year":"2024","unstructured":"Haoliang Zhou, Shucheng Huang, Feifei Zhang, and Changsheng Xu. 2024. Ceprompt: Cross-modal emotion-aware prompting for facial expression recognition. IEEE Transactions on Circuits and Systems for Video Technology (2024)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM48880.2022.9796782"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755637","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:34Z","timestamp":1765342774000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755637"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":53,"alternative-id":["10.1145\/3746027.3755637","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755637","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}