{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T17:15:32Z","timestamp":1775841332739,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"name":"Shenzhen Science and Technology Research and Development Fund for Sustainable Development Project","award":["GXWD20231128103819001"],"award-info":[{"award-number":["GXWD20231128103819001"]}]},{"name":"Guangdong Provincial Key Laboratory Grant","award":["2023B1212060076"],"award-info":[{"award-number":["2023B1212060076"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792569","type":"proceedings-article","created":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:54:34Z","timestamp":1775771674000},"page":"7421-7430","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CCAF: Coarse-to-fine Cross-Modal Alignment and Fusion for Multimodal Sentiment Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5482-3895","authenticated-orcid":false,"given":"Xianbing","family":"Zhao","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China, Jiangnan University, Wuxi, China, and Guangdong Provincial Key Laboratory of Intelligent Information Processing, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9287-4877","authenticated-orcid":false,"given":"Shengzun","family":"Yang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0271-8246","authenticated-orcid":false,"given":"Buzhou","family":"Tang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China, Pengcheng Laboratory, Shenzhen, China, and Guangdong Provincial Key Laboratory of Intelligent Information Processing, Shenzhen, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Deep variational information bottleneck. arXiv preprint arXiv:1612.00410","author":"Alemi Alexander A","year":"2016","unstructured":"Alexander A Alemi, Ian Fischer, Joshua V Dillon, and Kevin Murphy. 2016. Deep variational information bottleneck. arXiv preprint arXiv:1612.00410 (2016)."},{"key":"e_1_3_2_1_2_1","volume-title":"Meet claude. https:\/\/www. anthropic.com\/claude","year":"2024","unstructured":"Anthropic. 2024. Meet claude. https:\/\/www. anthropic.com\/claude"},{"key":"e_1_3_2_1_3_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966, Vol. 1, 2 (2023), 3. https:\/\/arxiv.org\/abs\/2308.12966"},{"key":"e_1_3_2_1_4_1","volume-title":"International conference on machine learning. PMLR, 531-540","author":"Belghazi Mohamed Ishmael","year":"2018","unstructured":"Mohamed Ishmael Belghazi, Aristide Baratin, Sai Rajeshwar, Sherjil Ozair, Yoshua Bengio, Aaron Courville, and Devon Hjelm. 2018. Mutual information neural estimation. In International conference on machine learning. PMLR, 531-540."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00020"},{"key":"e_1_3_2_1_6_1","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al., 2024. Scaling instruction-finetuned language models. Journal of Machine Learning Research, Vol. 25, 70 (2024), 1-53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_7_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv:2305.06500 [cs.CV] https:\/\/arxiv.org\/abs\/2305.06500"},{"key":"e_1_3_2_1_8_1","unstructured":"Google. 2023. Gemini: our largest and most capable ai model. https:\/\/blog.google\/technology\/ ai\/google-gemini-ai\/#sundar-note"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.123"},{"key":"e_1_3_2_1_12_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451-3460."},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Artificial Intelligence and Statistics. PMLR, 1513-1521","author":"Lee Changhee","year":"2021","unstructured":"Changhee Lee and Mihaela Van der Schaar. 2021. A variational information bottleneck approach to multi-omics data integration. In International Conference on Artificial Intelligence and Statistics. PMLR, 1513-1521."},{"key":"e_1_3_2_1_14_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023a. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arXiv:2301.12597 [cs.CV] https:\/\/arxiv.org\/abs\/2301.12597"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00641"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02307"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02536"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32131"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i18.34067"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00258"},{"key":"e_1_3_2_1_22_1","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, Nov (2008), 2579-2605.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2022.11.003"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3171679"},{"key":"e_1_3_2_1_25_1","volume-title":"Object-Oriented Anchoring and Modal Alignment in Multimodal Learning. In European Conference on Computer Vision. Springer, 179-196","author":"Mei Shibin","year":"2024","unstructured":"Shibin Mei, Bingbing Ni, Hang Wang, Chenglong Zhao, Fengfa Hu, Zhiming Pi, and Bilian Ke. 2024. Object-Oriented Anchoring and Modal Alignment in Multimodal Learning. In European Conference on Computer Vision. Springer, 179-196."},{"key":"e_1_3_2_1_26_1","volume-title":"Chatgpt: Large-scale language model fine-tuned for conversational applications. https:\/\/openai.com","author":"AI.","year":"2023","unstructured":"OpenAI. 2023a. Chatgpt: Large-scale language model fine-tuned for conversational applications. https:\/\/openai.com"},{"key":"e_1_3_2_1_27_1","unstructured":"OpenAI. 2023b. Gpt-4v(ision) system card. https:\/\/openai.com\/research\/ gpt-4v-system-card"},{"key":"e_1_3_2_1_28_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1-67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3178231"},{"key":"e_1_3_2_1_31_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the conference. Association for computational linguistics. Meeting","volume":"2019","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the conference. Association for computational linguistics. Meeting, Vol. 2019. 6558."},{"key":"e_1_3_2_1_33_1","volume-title":"Learning Factorized Multimodal Representations. In International Conference on Learning Representations.","author":"Hubert Tsai Yao-Hung","unstructured":"Yao-Hung Hubert Tsai, Paul Pu Liang, Amir Zadeh, Louis-Philippe Morency, and Ruslan Salakhutdinov. [n.d.]. Learning Factorized Multimodal Representations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_34_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i20.35416"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611975673.5"},{"key":"e_1_3_2_1_37_1","volume-title":"Multimodal generative models for scalable weakly-supervised learning. Advances in neural information processing systems","author":"Wu Mike","year":"2018","unstructured":"Mike Wu and Noah Goodman. 2018. Multimodal generative models for scalable weakly-supervised learning. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32152"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence. 6550-6558","author":"Wu Zhuojia","year":"2024","unstructured":"Zhuojia Wu, Qi Zhang, Duoqian Miao, Kun Yi, Wei Fan, and Liang Hu. 2024. HyDiscGAN: a hybrid distributed cGAN for audio-visual privacy preservation in multimodal sentiment analysis. In Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence. 6550-6558."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i14.29540"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01942"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547754"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.421"},{"key":"e_1_3_2_1_44_1","volume-title":"MM-InstructEval: Zero-shot evaluation of (Multimodal) Large Language Models on multimodal reasoning tasks. Information Fusion","author":"Yang Xiaocui","year":"2025","unstructured":"Xiaocui Yang, Wenfang Wu, Shi Feng, Ming Wang, Daling Wang, Yang Li, Qi Sun, Yifei Zhang, Xiaoming Fu, and Soujanya Poria. 2025b. MM-InstructEval: Zero-shot evaluation of (Multimodal) Large Language Models on multimodal reasoning tasks. Information Fusion (2025), 103204."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34755"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.860"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_49_1","volume-title":"Mosi: Multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv","author":"Zadeh Amir","year":"2016","unstructured":"Amir Zadeh, Rowan Zellers, Eli Pincus, and Louis-Philippe Morency. [n.d.]. Mosi: Multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv 2016. arXiv preprint arXiv:1606.06259, Vol. 6 ([n.d.])."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102031"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the 31st International Conference on Computational Linguistics. 4611-4621","author":"Zhang Xiangmin","year":"2025","unstructured":"Xiangmin Zhang, Wei Wei, and Shihao Zou. 2025. Modal Feature Optimization Network with Prompt for Multimodal Sentiment Analysis. In Proceedings of the 31st International Conference on Computational Linguistics. 4611-4621."},{"key":"e_1_3_2_1_53_1","volume-title":"Prototypical information bottlenecking and disentangling for multimodal cancer survival prediction. arXiv preprint arXiv:2401.01646","author":"Zhang Yilan","year":"2024","unstructured":"Yilan Zhang, Yingxue Xu, Jianqi Chen, Fengying Xie, and Hao Chen. 2024. Prototypical information bottlenecking and disentangling for multimodal cancer survival prediction. arXiv preprint arXiv:2401.01646 (2024)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746536"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3222023"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583406"},{"key":"e_1_3_2_1_57_1","volume-title":"Predicting the popularity of micro-videos with multimodal variational encoder-decoder framework. arXiv preprint arXiv:2003.12724","author":"Zhu Yaochen","year":"2020","unstructured":"Yaochen Zhu, Jiayi Xie, and Zhenzhong Chen. 2020. Predicting the popularity of micro-videos with multimodal variational encoder-decoder framework. arXiv preprint arXiv:2003.12724 (2020)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681527"}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"deposited":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:33:56Z","timestamp":1775838836000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792569"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":58,"alternative-id":["10.1145\/3774904.3792569","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792569","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}