{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T02:55:36Z","timestamp":1764557736927,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Project Funded by the Priority Academic Program Development of Jiangsu Higher Education Institutions"},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006166,62376178,62076175"],"award-info":[{"award-number":["62006166,62376178,62076175"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,22]]},"DOI":"10.1145\/3696410.3714642","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:47:11Z","timestamp":1745362031000},"page":"188-197","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Omni-SILA: Towards &lt;u&gt;Omni&lt;\/u&gt;-scene Driven Visual &lt;u&gt;S&lt;\/u&gt;entiment &lt;u&gt;I&lt;\/u&gt;dentifying, &lt;u&gt;L&lt;\/u&gt;ocating and &lt;u&gt;A&lt;\/u&gt;ttributing in Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2144-6921","authenticated-orcid":false,"given":"Jiamin","family":"Luo","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3619-1525","authenticated-orcid":false,"given":"Jingjing","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6437-2061","authenticated-orcid":false,"given":"Junxiao","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4832-6902","authenticated-orcid":false,"given":"Yujie","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1000-3278","authenticated-orcid":false,"given":"Shoushan","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7887-5099","authenticated-orcid":false,"given":"Guodong","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens. CoRR","author":"Ataallah Kirolos","year":"2024","unstructured":"Kirolos Ataallah, Xiaoqian Shen, Eslam Abdelrahman, Essam Sleiman, Deyao Zhu, Jian Ding, and Mohamed Elhoseiny. 2024. 
MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens. CoRR (2024)."},{"key":"e_1_3_2_1_2_1","first-page":"3117","article-title":"Curriculum-Listener","volume":"2023","author":"Chen Houlun","year":"2023","unstructured":"Houlun Chen, Xin Wang, Xiaohan Lan, Hong Chen, Xuguang Duan, Jia Jia, and Wenwu Zhu. 2023. Curriculum-Listener: Consistency- and Complementarity-Aware Audio-Enhanced Temporal Sentence Grounding. In Proceedings of ACM MM 2023. 3117--3128.","journal-title":"Consistency- and Complementarity-Aware Audio-Enhanced Temporal Sentence Grounding. In Proceedings of ACM MM"},{"key":"e_1_3_2_1_3_1","volume-title":"MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. CoRR","author":"Chen Jun","year":"2023","unstructured":"Jun Chen, Deyao Zhu, Xiaoqian Shen, Xiang Li, Zechun Liu, Pengchuan Zhang, Raghuraman Krishnamoorthi, Vikas Chandra, Yunyang Xiong, and Mohamed Elhoseiny. 2023. MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. CoRR (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.338"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00543"},{"key":"e_1_3_2_1_6_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3268066"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In Proceedings of ICLR 2021."},{"key":"e_1_3_2_1_9_1","volume-title":"LoRAMoE: Revolutionizing Mixture of Experts for Maintaining World Knowledge in Language Model Alignment. CoRR","author":"Dou Shihan","year":"2023","unstructured":"Shihan Dou, Enyu Zhou, Yan Liu, Songyang Gao, Jun Zhao,Wei Shen, Yuhao Zhou, Zhiheng Xi, Xiao Wang, Xiaoran Fan, Shiliang Pu, Jiang Zhu, Rui Zheng, Tao Gui, Qi Zhang, and Xuanjing Huang. 2023. LoRAMoE: Revolutionizing Mixture of Experts for Maintaining World Knowledge in Language Model Alignment. CoRR (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01778"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0078506"},{"key":"e_1_3_2_1_12_1","volume-title":"VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting, Editing.","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024. VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting, Editing."},{"key":"e_1_3_2_1_13_1","volume-title":"Cognition-driven multimodal personality classification. Sci. China Inf. Sci. 65, 10","author":"Gao Xiaoya","year":"2022","unstructured":"Xiaoya Gao, Jingjing Wang, Shoushan Li, Min Zhang, and Guodong Zhou. 2022. Cognition-driven multimodal personality classification. Sci. China Inf. 
Sci. 65, 10 (2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02510"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.2307\/2346830"},{"key":"e_1_3_2_1_18_1","first-page":"1122","article-title":"MISA","volume":"2020","author":"Hazarika Devamanyu","year":"2020","unstructured":"Devamanyu Hazarika, Roger Zimmermann, and Soujanya Poria. 2020. MISA: Modality-Invariant and -Specific Representations for Multimodal Sentiment Analysis. In Proceedings of ACM MM 2020. 1122--1131.","journal-title":"Modality-Invariant and -Specific Representations for Multimodal Sentiment Analysis. In Proceedings of ACM MM"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.534"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of EMNLP","author":"Kazemzadeh Sahar","year":"2014","unstructured":"Sahar Kazemzadeh, Vicente Ordonez, Mark Matten, and Tamara L. Berg. 2014. ReferItGame: Referring to Objects in Photographs of Natural Scenes. In Proceedings of EMNLP 2014. 787--798."},{"key":"e_1_3_2_1_22_1","volume-title":"Segment Anything. In Proceedings of ICCV","author":"Kirillov Alexander","year":"2023","unstructured":"Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chlo\u00e9 Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r, and Ross B. Girshick. 2023. Segment Anything. In Proceedings of ICCV 2023. 3992--4003."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16280"},{"key":"e_1_3_2_1_24_1","volume-title":"Otter: A Multi-Modal Model with In-Context Instruction Tuning. CoRR","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Jingkang Yang, and Ziwei Liu. 2023. Otter: A Multi-Modal Model with In-Context Instruction Tuning. CoRR (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of ICML 2023. 1973","author":"Li Junnan","year":"1974","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of ICML 2023. 19730--19742."},{"key":"e_1_3_2_1_26_1","volume-title":"VideoChat: Chat-Centric Video Understanding. CoRR","author":"Li Kunchang","year":"2023","unstructured":"Kunchang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. VideoChat: Chat-Centric Video Understanding. CoRR (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.111"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.360"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW63481.2024.10645462"},{"key":"e_1_3_2_1_30_1","volume-title":"AffectGPT: Dataset and Framework for Explainable Multimodal Emotion Recognition. CoRR","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Jiangyan Yi, Bin Liu, and Jianhua Tao. 2024. AffectGPT: Dataset and Framework for Explainable Multimodal Emotion Recognition. 
CoRR (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. CoRR","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. CoRR (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of NeurIPS","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, QingyangWu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In Proceedings of NeurIPS 2023."},{"key":"e_1_3_2_1_33_1","volume-title":"Valley: Video Assistant with Large Language model Enhanced abilitY. CoRR","author":"Luo Ruipu","year":"2023","unstructured":"Ruipu Luo, Ziwang Zhao, Min Yang, Junwei Dong, Minghui Qiu, Pengcheng Lu, TaoWang, and ZhongyuWei. 2023. Valley: Video Assistant with Large Language model Enhanced abilitY. CoRR (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01367"},{"key":"e_1_3_2_1_37_1","unstructured":"Judea Pearl and Dana Mackenzie. 2018. The book of why: the new science of cause and effect. Basic books."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of ICLR","author":"Puigcerver Joan","year":"2024","unstructured":"Joan Puigcerver, Carlos Riquelme Ruiz, Basil Mustafa, and Neil Houlsby. 2024. From Sparse to Soft Mixtures of Experts. In Proceedings of ICLR 2024."},{"key":"e_1_3_2_1_39_1","volume-title":"TimeChat: A Time-sensitive Multimodal Large Language Model for Long Video Understanding. CoRR","author":"Ren Shuhuai","year":"2023","unstructured":"Shuhuai Ren, Linli Yao, Shicheng Li, Xu Sun, and Lu Hou. 2023. TimeChat: A Time-sensitive Multimodal Large Language Model for Long Video Understanding. CoRR (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/2578726.2578772"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.5555\/2627435.2670313"},{"key":"e_1_3_2_1_42_1","volume-title":"PandaGPT: One Model To Instruction-Follow Them All. CoRR","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. PandaGPT: One Model To Instruction-Follow Them All. CoRR (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of ICNN","author":"Peter","year":"1993","unstructured":"Peter T. Szymanski and Michael D. Lemmon. 1993. Adaptive mixtures of local experts are source coding solutions. In Proceedings of ICNN 1993. 1391--1396."},{"key":"e_1_3_2_1_44_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. CoRR (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of ACL","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J. Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal Transformer for Unaligned Multimodal Language Sequences. In Proceedings of ACL 2019. 6558--6569."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of EMNLP 2020. 1823","author":"Hubert Tsai Yao-Hung","year":"2020","unstructured":"Yao-Hung Hubert Tsai, Martin Ma, Muqiao Yang, Ruslan Salakhutdinov, and Louis-Philippe Morency. 2020. 
Multimodal Routing: Improving Local and Global Interpretability of Multimodal Language Analysis. In Proceedings of EMNLP 2020. 1823--1833."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2017.12.005"},{"key":"e_1_3_2_1_48_1","volume-title":"HawkEye: Training Video-Text LLMs for Grounding Text in Videos. CoRR","author":"Wang Yueqian","year":"2024","unstructured":"Yueqian Wang, Xiaojun Meng, Jianxin Liang, Yuxuan Wang, Qun Liu, and Dongyan Zhao. 2024. HawkEye: Training Video-Text LLMs for Grounding Text in Videos. CoRR (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"GPT4Video: A Unified Multimodal Large Language Model for lnstruction-Followed Understanding and Safety-Aware Generation. CoRR","author":"Wang Zhanyu","year":"2023","unstructured":"Zhanyu Wang, Longyue Wang, Zhen Zhao, Minghao Wu, Chenyang Lyu, Huayang Li, Deng Cai, Luping Zhou, Shuming Shi, and Zhaopeng Tu. 2023. GPT4Video: A Unified Multimodal Large Language Model for lnstruction-Followed Understanding and Safety-Aware Generation. CoRR (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28423"},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of ICLR","author":"Wu Xun","year":"2024","unstructured":"Xun Wu, Shaohan Huang, and Furu Wei. 2024. Mixture of LoRA Experts. In Proceedings of ICLR 2024."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02511"},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of ICML 2015. 2048","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron C. Courville, Ruslan Salakhutdinov, Richard S. Zemel, and Yoshua Bengio. 2015. Show, Attend and Tell: Neural Image Caption Generation with Visual Attention. In Proceedings of ICML 2015. 2048--2057."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00926"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01864"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3106813"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.421"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00972"},{"key":"e_1_3_2_1_59_1","volume-title":"The Dawn of LMMs: Preliminary Explorations with GPT-4V(ision). CoRR","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, JianfengWang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023. The Dawn of LMMs: Preliminary Explorations with GPT-4V(ision). CoRR (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. CoRR","author":"Ye Qinghao","year":"2023","unstructured":"Qinghao Ye, Haiyang Xu, Guohai Xu, Jiabo Ye, Ming Yan, Yiyang Zhou, Junyang Wang, Anwen Hu, Pengcheng Shi, Yaya Shi, Chenliang Li, Yuanhong Xu, Hehong Chen, Junfeng Tian, Qian Qi, Ji Zhang, and Fei Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. 
CoRR (2023)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_64_1","volume-title":"Proceedings of ACL","author":"Zadeh Amir","year":"2018","unstructured":"Amir Zadeh, Paul Pu Liang, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency. 2018. Multimodal Language Analysis in the Wild: Carnegie Mellon University-MOSEI Dataset and Interpretable Dynamic Fusion Graph. In Proceedings of ACL 2018. 2236--2246."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01575"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_1_67_1","volume-title":"Holmes-VAD: Towards Unbiased and Explainable Video Anomaly Detection via Multi-modal LLM. CoRR","author":"Zhang Huaxin","year":"2024","unstructured":"Huaxin Zhang, Xiaohao Xu, Xiang Wang, Jialong Zuo, Chuchu Han, Xiaonan Huang, Changxin Gao, Yuehuan Wang, and Nong Sang. 2024. Holmes-VAD: Towards Unbiased and Explainable Video Anomaly Detection via Multi-modal LLM. CoRR (2024)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"},{"key":"e_1_3_2_1_69_1","first-page":"199","article-title":"Temporal Sentiment Localization","volume":"2022","author":"Zhang Zhicheng","year":"2022","unstructured":"Zhicheng Zhang and Jufeng Yang. 2022. Temporal Sentiment Localization: Listen and Look in Untrimmed Videos. In Proceedings of ACM MM 2022. 199--208.","journal-title":"Listen and Look in Untrimmed Videos. In Proceedings of ACM MM"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681407"},{"key":"e_1_3_2_1_71_1","first-page":"192","article-title":"PDANet","volume":"2019","author":"Zhao Sicheng","year":"2019","unstructured":"Sicheng Zhao, Zizhou Jia, Hui Chen, Leida Li, Guiguang Ding, and Kurt Keutzer. 2019. PDANet: Polarity-consistent Deep Attention Network for Fine-grained Visual Emotion Regression. In Proceedings of ACM MM 2019. 192--201.","journal-title":"In Proceedings of ACM MM"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3094362"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"e_1_3_2_1_74_1","volume-title":"Proceedings of ICLR","author":"Zhu Bin","year":"2024","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, Hongfa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, Caiwan Zhang, Zhifeng Li, Wei Liu, and Li Yuan. 2024. LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment. In Proceedings of ICLR 2024."},{"key":"e_1_3_2_1_75_1","volume-title":"Proceedings of ICLR","author":"Zhu Deyao","year":"2024","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2024. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. 
In Proceedings of ICLR 2024."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714642","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714642","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:56Z","timestamp":1750295936000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714642"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":74,"alternative-id":["10.1145\/3696410.3714642","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714642","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
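The record above is a standard Crossref works-API response body: the bibliographic payload lives under the "message" key, with the title, DOI, author list, and deposited references as sibling fields. As a minimal sketch of consuming such a record, assuming it has been saved locally as work.json (a hypothetical filename; the same payload is served by the public Crossref API at https://api.crossref.org/works/10.1145/3696410.3714642):

```python
import json

# Load the Crossref response; the work itself sits under "message".
with open("work.json", encoding="utf-8") as fh:
    record = json.load(fh)
work = record["message"]

# "title" is a list (works can carry multiple title variants).
print(work["title"][0])
print(work["DOI"])

# Crossref stores given and family names as separate fields per author.
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work["author"])
print(authors)

# Size of the publisher-deposited bibliography.
print(work["references-count"], "references")
```

Note that only some "reference" entries carry a resolvable "DOI"; others are deposited as "unstructured" citation strings, so a robust consumer should handle both shapes.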