{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T16:22:04Z","timestamp":1781281324690,"version":"3.54.1"},"reference-count":152,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2025,9,28]],"date-time":"2025-09-28T00:00:00Z","timestamp":1759017600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,28]],"date-time":"2025-09-28T00:00:00Z","timestamp":1759017600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s11432-024-4593-8","type":"journal-article","created":{"date-parts":[[2025,10,4]],"date-time":"2025-10-04T10:38:02Z","timestamp":1759574282000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["Large language models meet text-centric multimodal sentiment analysis: a 
survey"],"prefix":"10.1007","volume":"68","author":[{"given":"Hao","family":"Yang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yanyan","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yang","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shilong","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tian","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hongbo","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zongyang","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wanxiang","family":"Che","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shijin","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Si","family":"Wei","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bing","family":"Qin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,9,28]]},"reference":[{"key":"4593_CR1","doi-asserted-by":"crossref","first-page":"478","DOI":"10.1109\/JSTSP.2020.2987728","volume":"14","author":"C Zhang","year":"2020","unstructured":"Zhang C, Yang Z, He X, et al. Multimodal intelligence: representation learning, information fusion, and applications. 
IEEE J Sel Top Signal Process, 2020, 14: 478\u2013493","journal-title":"IEEE J Sel Top Signal Process"},{"key":"4593_CR2","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1016\/j.imavis.2017.08.003","volume":"65","author":"M Soleymani","year":"2017","unstructured":"Soleymani M, Garcia D, Jou B, et al. A survey of multimodal sentiment analysis. Image Vision Comput, 2017, 65: 3\u201314","journal-title":"Image Vision Comput"},{"key":"4593_CR3","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","author":"T B Brown","year":"2020","unstructured":"Brown T B, Mann B, Ryder N, et al. Language models are few-shot learners. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS), 2020"},{"key":"4593_CR4","volume-title":"Scaling language models: methods, analysis & insights from training Gopher","author":"J W Rae","year":"2021","unstructured":"Rae J W, Borgeaud S, Cai T, et al. Scaling language models: methods, analysis & insights from training Gopher. 2021. ArXiv:2112.11446"},{"key":"4593_CR5","volume-title":"GPT-4 technical report","author":"OpenAI.","year":"2023","unstructured":"OpenAI. GPT-4 technical report. 2023. ArXiv:2303.08774"},{"key":"4593_CR6","volume-title":"Can ChatGPT understand too? A comparative study on ChatGPT and fine-tuned BERT","author":"Q Zhong","year":"2023","unstructured":"Zhong Q, Ding L, Liu J, et al. Can ChatGPT understand too? A comparative study on ChatGPT and fine-tuned BERT. 2023. ArXiv:2302.10198"},{"key":"4593_CR7","volume-title":"A multitask, multilingual, multimodal evaluation of ChatGPT on reasoning, hallucination, and interactivity","author":"Y Bang","year":"2023","unstructured":"Bang Y, Cahyawijaya S, Lee N, et al. A multitask, multilingual, multimodal evaluation of ChatGPT on reasoning, hallucination, and interactivity. 2023. 
ArXiv:2302.04023"},{"key":"4593_CR8","volume-title":"A comprehensive capability analysis of GPT-3 and GPT-3.5 series models","author":"J Ye","year":"2023","unstructured":"Ye J, Chen X, Xu N, et al. A comprehensive capability analysis of GPT-3 and GPT-3.5 series models. 2023. ArXiv:2303.10420"},{"key":"4593_CR9","volume-title":"Harnessing the power of LLMs in practice: a survey on ChatGPT and beyond","author":"J Yang","year":"2023","unstructured":"Yang J, Jin H, Tang R, et al. Harnessing the power of LLMs in practice: a survey on ChatGPT and beyond. 2023. ArXiv:2304.13712"},{"key":"4593_CR10","first-page":"1014","volume-title":"Proceedings of the ACM Web Conference (WWW)","author":"X Deng","year":"2023","unstructured":"Deng X, Bashlovkina V, Han F, et al. LLMs to the moon? Reddit market sentiment analysis with LLMs. In: Proceedings of the ACM Web Conference (WWW), 2023. 1014\u20131019"},{"key":"4593_CR11","volume-title":"Is ChatGPT a good sentiment analyzer? A preliminary study","author":"Z Wang","year":"2023","unstructured":"Wang Z, Xie Q, Ding Z, et al. Is ChatGPT a good sentiment analyzer? A preliminary study. 2023. ArXiv:2304.04339"},{"key":"4593_CR12","volume-title":"Sentiment analysis in the era of large language models: a reality check","author":"W Zhang","year":"2023","unstructured":"Zhang W, Deng Y, Liu B, et al. Sentiment analysis in the era of large language models: a reality check. 2023. ArXiv:2305.15005"},{"key":"4593_CR13","volume-title":"Is ChatGPT equipped with emotional dialogue capabilities?","author":"W Zhao","year":"2023","unstructured":"Zhao W, Zhao Y, Lu X, et al. Is ChatGPT equipped with emotional dialogue capabilities? 2023. ArXiv:2304.09582"},{"key":"4593_CR14","volume-title":"PaLM: scaling language modeling with pathways","author":"A Chowdhery","year":"2022","unstructured":"Chowdhery A, Narang S, Devlin J, et al. PaLM: scaling language modeling with pathways. 2022. 
ArXiv:2204.02311"},{"key":"4593_CR15","volume-title":"Galactica: a large language model for science","author":"R Taylor","year":"2022","unstructured":"Taylor R, Kardas M, Cucurull G, et al. Galactica: a large language model for science. 2022. ArXiv:2211.09085"},{"key":"4593_CR16","volume-title":"LLaMA: open and efficient foundation language models","author":"H Touvron","year":"2023","unstructured":"Touvron H, Lavril T, Izacard G, et al. LLaMA: open and efficient foundation language models. 2023. arXiv:2302.13971"},{"key":"4593_CR17","volume-title":"Proceedings of the 10th International Conference on Learning Representations (ICLR)","author":"J Wei","year":"2022","unstructured":"Wei J, Bosma M, Zhao V Y, et al. Finetuned language models are zero-shot learners. In: Proceedings of the 10th International Conference on Learning Representations (ICLR), 2022"},{"key":"4593_CR18","volume-title":"Proceedings of the 10th International Conference on Learning Representations (ICLR)","author":"V Sanh","year":"2022","unstructured":"Sanh V, Webson A, Raffel C, et al. Multitask-prompted training enables zero-shot task generalization. In: Proceedings of the 10th International Conference on Learning Representations (ICLR), 2022"},{"key":"4593_CR19","volume-title":"Scaling instruction-finetuned language models","author":"H W Chung","year":"2022","unstructured":"Chung H W, Hou L, Longpre S, et al. Scaling instruction-finetuned language models. 2022. ArXiv:2210.11416"},{"key":"4593_CR20","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, et al. Attention is all you need. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS), 2017"},{"key":"4593_CR21","volume-title":"Refashioning emotion recognition modelling: the advent of generalised large models","author":"Z Zhang","year":"2023","unstructured":"Zhang Z, Peng L, Pang T, et al. 
Refashioning emotion recognition modelling: the advent of generalised large models. 2023. ArXiv:2308.11578"},{"key":"4593_CR22","volume-title":"Gemini: a family of highly capable multimodal models","author":"G Team","year":"2023","unstructured":"Team G, Anil R, Borgeaud S, et al. Gemini: a family of highly capable multimodal models. 2023. ArXiv:2312.11805"},{"key":"4593_CR23","first-page":"15180","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"R Girdhar","year":"2023","unstructured":"Girdhar R, El-Nouby A, Liu Z, et al. ImageBind: one embedding space to bind them all. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2023. 15180\u201315190"},{"key":"4593_CR24","first-page":"19730","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"J Li","year":"2023","unstructured":"Li J, Li D, Savarese S, et al. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the International Conference on Machine Learning (ICML), 2023. 19730\u201319742"},{"key":"4593_CR25","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","author":"H Liu","year":"2024","unstructured":"Liu H, Li C, Wu Q, et al. Visual instruction tuning. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS), 2024"},{"key":"4593_CR26","first-page":"8748","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"A Radford","year":"2021","unstructured":"Radford A, Kim J W, Hallacy C, et al. Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning (ICML), 2021. 
8748\u20138763"},{"key":"4593_CR27","volume-title":"Vicuna: an open-source chatbot impressing GPT-4 with 90% ChatGPT quality","author":"W L Chiang","year":"2023","unstructured":"Chiang W L, Li Z, Lin Z, et al. Vicuna: an open-source chatbot impressing GPT-4 with 90% ChatGPT quality. 2023. https:\/\/vicuna.lmsys.org"},{"key":"4593_CR28","first-page":"2556","volume-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"P Sharma","year":"2018","unstructured":"Sharma P, Ding N, Goodman S, et al. Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL), 2018. 2556\u20132565"},{"key":"4593_CR29","first-page":"2507","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","author":"P Lu","year":"2022","unstructured":"Lu P, Mishra S, Xia T, et al. Learn to explain: multimodal reasoning via thought chains for science question answering. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS), 2022. 2507\u20132521"},{"key":"4593_CR30","volume-title":"Qwen-VL: a frontier large vision-language model with versatile abilities","author":"J Bai","year":"2023","unstructured":"Bai J, Bai S, Yang S, et al. Qwen-VL: a frontier large vision-language model with versatile abilities. 2023. ArXiv:2308.12966"},{"key":"4593_CR31","volume-title":"Large language models meet NLP: a survey","author":"L Qin","year":"2024","unstructured":"Qin L, Chen Q, Feng X, et al. Large language models meet NLP: a survey. 2024. ArXiv:2405.12819"},{"key":"4593_CR32","first-page":"2790","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"N Houlsby","year":"2019","unstructured":"Houlsby N, Giurgiu A, Jastrzebski S, et al. Parameter-efficient transfer learning for NLP. 
In: Proceedings of the International Conference on Machine Learning (ICML), 2019. 2790\u20132799"},{"key":"4593_CR33","volume-title":"LoRA: low-rank adaptation of large language models","author":"E J Hu","year":"2021","unstructured":"Hu E J, Shen Y, Wallis P, et al. LoRA: low-rank adaptation of large language models. 2021. ArXiv:2106.09685"},{"key":"4593_CR34","volume-title":"Prefix-tuning: optimizing continuous prompts for generation","author":"X L Li","year":"2021","unstructured":"Li X L, Liang P, et al. Prefix-tuning: optimizing continuous prompts for generation. 2021. ArXiv:2101.00190"},{"key":"4593_CR35","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","author":"T Dettmers","year":"2024","unstructured":"Dettmers T, Pagnoni A, Holtzman A, et al. QLoRA: efficient finetuning of quantized LLMs. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS), 2024"},{"key":"4593_CR36","first-page":"15","volume-title":"Proceedings of the 22nd International Conference on MultiMedia Modeling","author":"T Niu","year":"2016","unstructured":"Niu T, Zhu S, Pang L, et al. Sentiment analysis on multi-view social data. In: Proceedings of the 22nd International Conference on MultiMedia Modeling, 2016. 15\u201327"},{"key":"4593_CR37","volume-title":"Proceedings of De-Factify: Workshop on Multimodal Fact Checking and Hate Speech Detection","author":"S Ramamoorthy","year":"2022","unstructured":"Ramamoorthy S, Gunti N, Mishra S, et al. Memotion 2: dataset on sentiment and emotion analysis of memes. In: Proceedings of De-Factify: Workshop on Multimodal Fact Checking and Hate Speech Detection, 2022"},{"key":"4593_CR38","first-page":"1512","volume-title":"Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","author":"A Jia","year":"2022","unstructured":"Jia A, He Y, Zhang Y, et al. 
Beyond emotion: a multi-modal dataset for human desire understanding. In: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT), 2022. 1512\u20131522"},{"key":"4593_CR39","first-page":"305","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"Q T Truong","year":"2019","unstructured":"Truong Q T, Lauw H W. VistaNet: visual aspect attention network for multimodal sentiment analysis. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2019. 305\u2013312"},{"key":"4593_CR40","doi-asserted-by":"crossref","first-page":"223","DOI":"10.1145\/2502081.2502282","volume-title":"Proceedings of the 21st ACM International Conference on Multimedia","author":"D Borth","year":"2013","unstructured":"Borth D, Ji R, Chen T, et al. Large-scale visual sentiment ontology and detectors using adjective noun pairs. In: Proceedings of the 21st ACM International Conference on Multimedia, 2013. 223\u2013232"},{"key":"4593_CR41","volume-title":"Proceedings of the International Conference on Internet Multimedia Computing and Service","author":"M Wang","year":"2014","unstructured":"Wang M, Cao D, Li L, et al. Microblog sentiment analysis based on cross-media bag-of-words model. In: Proceedings of the International Conference on Internet Multimedia Computing and Service, 2014"},{"key":"4593_CR42","doi-asserted-by":"crossref","first-page":"159","DOI":"10.1007\/978-3-319-25207-0_14","volume-title":"Proceedings of National CCF Conference on Natural Language Processing and Chinese Computing (NLPCC)","author":"G Cai","year":"2015","unstructured":"Cai G, Xia B. Convolutional neural networks for multimedia sentiment analysis. In: Proceedings of National CCF Conference on Natural Language Processing and Chinese Computing (NLPCC), 2015. 
159\u2013167"},{"key":"4593_CR43","doi-asserted-by":"crossref","first-page":"41","DOI":"10.3390\/a9020041","volume":"9","author":"Y Yu","year":"2016","unstructured":"Yu Y, Lin H, Meng J, et al. Visual and textual sentiment analysis of a microblog using deep convolutional neural networks. Algorithms, 2016, 9: 41","journal-title":"Algorithms"},{"key":"4593_CR44","first-page":"152","volume-title":"Proceedings of the IEEE International Conference on Intelligence and Security Informatics (ISI)","author":"N Xu","year":"2017","unstructured":"Xu N. Analyzing multimodal public sentiment based on hierarchical semantic attentional network. In: Proceedings of the IEEE International Conference on Intelligence and Security Informatics (ISI), 2017. 152\u2013154"},{"key":"4593_CR45","first-page":"2399","volume-title":"Proceedings of the ACM Conference on Information and Knowledge Management (CIKM)","author":"N Xu","year":"2017","unstructured":"Xu N, Mao W. MultiSentiNet: a deep semantic network for multimodal sentiment analysis. In: Proceedings of the ACM Conference on Information and Knowledge Management (CIKM), 2017. 2399\u20132402"},{"key":"4593_CR46","first-page":"929","volume-title":"Proceedings of the 41st International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)","author":"N Xu","year":"2018","unstructured":"Xu N, Mao W, Chen G, et al. A co-memory network for multimodal sentiment analysis. In: Proceedings of the 41st International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR), 2018. 929\u2013932"},{"key":"4593_CR47","first-page":"328","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing","author":"X Yang","year":"2021","unstructured":"Yang X, Feng S, Zhang Y, et al. Multimodal sentiment detection based on multi-channel graph neural networks. 
In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, 2021. 328\u2013339"},{"key":"4593_CR48","doi-asserted-by":"crossref","first-page":"189","DOI":"10.1145\/3503161.3548306","volume-title":"Proceedings of the 30th ACM International Conference on Multimedia","author":"Y Yu","year":"2022","unstructured":"Yu Y, Zhang D, Li S, et al. Unified multi-modal pre-training for few-shot sentiment analysis with prompt-based learning. In: Proceedings of the 30th ACM International Conference on Multimedia, 2022. 189\u2013198"},{"key":"4593_CR49","first-page":"1","volume-title":"Proceedings of IEEE International Conference on Multimedia and Expo (ICME)","author":"Y Yu","year":"2022","unstructured":"Yu Y, Zhang D. Few-shot multi-modal sentiment analysis with prompt-based vision-aware language modeling. In: Proceedings of IEEE International Conference on Multimedia and Expo (ICME), 2022. 1\u20136"},{"key":"4593_CR50","doi-asserted-by":"crossref","first-page":"6868","DOI":"10.1109\/TMM.2022.3214989","volume":"25","author":"T Zhu","year":"2023","unstructured":"Zhu T, Li L, Yang J, et al. Multimodal emotion classification with multi-level semantic reasoning network. IEEE Trans Multimedia, 2023, 25: 6868\u20136880","journal-title":"IEEE Trans Multimedia"},{"key":"4593_CR51","doi-asserted-by":"crossref","first-page":"4014","DOI":"10.1109\/TMM.2020.3035277","volume":"23","author":"X Yang","year":"2020","unstructured":"Yang X, Feng S, Wang D, et al. Image-text multimodal emotion classification via multi-view attentional network. 
IEEE Trans Multimedia, 2020, 23: 4014\u20134026","journal-title":"IEEE Trans Multimedia"},{"key":"4593_CR52","doi-asserted-by":"crossref","first-page":"2506","DOI":"10.18653\/v1\/P19-1239","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Y Cai","year":"2019","unstructured":"Cai Y, Cai H, Wan X, et al. Multimodal sarcasm detection in Twitter with hierarchical fusion model. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL), 2019. 2506\u20132515"},{"key":"4593_CR53","volume-title":"MMSD 2.0: towards a reliable multi-modal sarcasm detection system","author":"L Qin","year":"2023","unstructured":"Qin L, Huang S, Chen Q, et al. MMSD 2.0: towards a reliable multi-modal sarcasm detection system. 2023. ArXiv:2307.07135"},{"key":"4593_CR54","doi-asserted-by":"crossref","first-page":"1136","DOI":"10.1145\/2964284.2964321","volume-title":"Proceedings of the 24th ACM International Conference on Multimedia","author":"R Schifanella","year":"2016","unstructured":"Schifanella R, de Juan P, Tetreault J, et al. Detecting sarcasm in multimodal social platforms. In: Proceedings of the 24th ACM International Conference on Multimedia, 2016. 1136\u20131145"},{"key":"4593_CR55","first-page":"1383","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"H Pan","year":"2020","unstructured":"Pan H, Lin Z, Fu P, et al. Modeling intra and inter-modality incongruity for multi-modal sarcasm detection. In: Proceedings of Findings of the Association for Computational Linguistics, 2020. 1383\u20131392"},{"key":"4593_CR56","doi-asserted-by":"crossref","first-page":"19","DOI":"10.18653\/v1\/2020.nlpbt-1.3","volume-title":"Proceedings of the 1st International Workshop on Natural Language Processing Beyond Text","author":"X Wang","year":"2020","unstructured":"Wang X, Sun X, Yang T, et al. 
Building a bridge: a method for image-text sarcasm detection without pretraining on image-text data. In: Proceedings of the 1st International Workshop on Natural Language Processing Beyond Text, 2020. 19\u201329"},{"key":"4593_CR57","doi-asserted-by":"crossref","first-page":"7399","DOI":"10.1007\/s12652-022-04447-y","volume":"14","author":"D Tom\u00e1s","year":"2023","unstructured":"Tom\u00e1s D, Ortega-Bueno R, Zhang G, et al. Transformer-based models for multimodal irony detection. J Ambient Intell Hum Comput, 2023, 14: 7399\u20137410","journal-title":"J Ambient Intell Hum Comput"},{"key":"4593_CR58","doi-asserted-by":"crossref","first-page":"4707","DOI":"10.1145\/3474085.3475190","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia","author":"B Liang","year":"2021","unstructured":"Liang B, Lou C, Li X, et al. Multi-modal sarcasm detection with interactive in-modal and cross-modal graphs. In: Proceedings of the 29th ACM International Conference on Multimedia, 2021. 4707\u20134715"},{"key":"4593_CR59","first-page":"1767","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"B Liang","year":"2022","unstructured":"Liang B, Lou C, Li X, et al. Multi-modal sarcasm detection via cross-modal graph convolutional network. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (ACL), 2022. 1767\u20131777"},{"key":"4593_CR60","doi-asserted-by":"crossref","first-page":"101921","DOI":"10.1016\/j.inffus.2023.101921","volume":"100","author":"T Yue","year":"2023","unstructured":"Yue T, Mao R, Wang H, et al. KnowleNet: knowledge fusion network for multimodal sarcasm detection. Inf Fusion, 2023, 100: 101921","journal-title":"Inf Fusion"},{"key":"4593_CR61","doi-asserted-by":"crossref","first-page":"2146","DOI":"10.3390\/app14052146","volume":"14","author":"H Liu","year":"2024","unstructured":"Liu H, Yang B, Yu Z. 
A multi-view interactive approach for multimodal sarcasm detection in social Internet of Things with knowledge enhancement. Appl Sci, 2024, 14: 2146","journal-title":"Appl Sci"},{"key":"4593_CR62","doi-asserted-by":"crossref","first-page":"855","DOI":"10.3390\/electronics13050855","volume":"13","author":"H Fu","year":"2024","unstructured":"Fu H, Liu H, Wang H, et al. Multi-modal sarcasm detection with sentiment word embedding. Electronics, 2024, 13: 855","journal-title":"Electronics"},{"key":"4593_CR63","first-page":"1732","volume-title":"Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","author":"B Tang","year":"2024","unstructured":"Tang B, Lin B, Yan H, et al. Leveraging generative large language models with visual instruction and demonstration retrieval for multimodal sarcasm detection. In: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT), 2024. 1732\u20131742"},{"key":"4593_CR64","volume-title":"CofiPara: a coarse-to-fine paradigm for multimodal sarcasm target identification with large multimodal models","author":"H Lin","year":"2024","unstructured":"Lin H, Chen Z, Luo Z, et al. CofiPara: a coarse-to-fine paradigm for multimodal sarcasm target identification with large multimodal models. 2024. ArXiv:2405.00390"},{"key":"4593_CR65","volume-title":"Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI)","author":"J Yu","year":"2019","unstructured":"Yu J, Jiang J. Adapting BERT for target-oriented multimodal sentiment classification. 
In: Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI), 2019"},{"key":"4593_CR66","first-page":"4498","volume-title":"Proceedings of the Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING)","author":"Z Zheng","year":"2024","unstructured":"Zheng Z, Zhang Z, Wang Z, et al. Decompose, prioritize, and eliminate: dynamically integrating diverse representations for multimodal named entity recognition. In: Proceedings of the Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING), 2024. 4498\u20134508"},{"key":"4593_CR67","doi-asserted-by":"crossref","first-page":"1038","DOI":"10.1145\/3394171.3413650","volume-title":"Proceedings of the 28th ACM International Conference on Multimedia","author":"Z Wu","year":"2020","unstructured":"Wu Z, Zheng C, Cai Y, et al. Multimodal representation with embedded visual guiding objects for named entity recognition in social media posts. In: Proceedings of the 28th ACM International Conference on Multimedia, 2020. 1038\u20131046"},{"key":"4593_CR68","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"Q Zhang","year":"2018","unstructured":"Zhang Q, Fu J, Liu X, et al. Adaptive co-attention network for named entity recognition in tweets. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2018"},{"key":"4593_CR69","first-page":"13860","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"L Sun","year":"2021","unstructured":"Sun L, Wang J, Zhang K, et al. RpBERT: a text-image relation propagation-based BERT model for multimodal NER. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2021. 
13860\u201313868"},{"key":"4593_CR70","volume-title":"Proceedings of the Association for Computational Linguistics (ACL)","author":"J Yu","year":"2022","unstructured":"Yu J, Jiang J, Yang L, et al. Improving multimodal named entity recognition via entity span detection with unified multimodal transformer. In: Proceedings of the Association for Computational Linguistics (ACL), 2022"},{"key":"4593_CR71","volume-title":"Multimodal named entity recognition for short social media posts","author":"S Moon","year":"2018","unstructured":"Moon S, Neves L, Carvalho V, et al. Multimodal named entity recognition for short social media posts. 2018. ArXiv:1802.07862"},{"key":"4593_CR72","first-page":"14347","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"D Zhang","year":"2021","unstructured":"Zhang D, Wei S, Li S, et al. Multi-modal graph fusion for named entity recognition with targeted visual guidance. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2021. 14347\u201314355"},{"key":"4593_CR73","first-page":"371","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"N Xu","year":"2019","unstructured":"Xu N, Mao W, Chen G, et al. Multi-interactive memory network for aspect-based multimodal sentiment analysis. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2019. 371\u2013378"},{"key":"4593_CR74","doi-asserted-by":"crossref","first-page":"3034","DOI":"10.1145\/3474085.3475692","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia","author":"Z Khan","year":"2021","unstructured":"Khan Z, Fu Y, et al. Exploiting BERT for multimodal target sentiment classification through input space translation. In: Proceedings of the 29th ACM International Conference on Multimedia, 2021. 
3034\u20133042"},{"key":"4593_CR75","doi-asserted-by":"crossref","first-page":"429","DOI":"10.1109\/TASLP.2019.2957872","volume":"28","author":"J Yu","year":"2020","unstructured":"Yu J, Jiang J, Xia R. Entity-sensitive attention and fusion network for entity-level multimodal sentiment classification. IEEE ACM Trans Audio Speech Lang Process, 2020, 28: 429\u2013439","journal-title":"IEEE ACM Trans Audio Speech Lang Process"},{"key":"4593_CR76","first-page":"3324","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"H Yang","year":"2022","unstructured":"Yang H, Zhao Y, Qin B, et al. Face-sensitive image-to-emotional-text cross-modal translation for multimodal aspect-based sentiment analysis. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2022. 3324\u20133335"},{"key":"4593_CR77","first-page":"1996","volume-title":"Proceedings of the Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING)","author":"J Feng","year":"2024","unstructured":"Feng J, Lin M, Shang L, et al. Autonomous aspect-image instruction (A2II): Q-Former guided multimodal sentiment classification. In: Proceedings of the Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING), 2024. 1996\u20132005"},{"key":"4593_CR78","volume-title":"Vision-language pre-training for multimodal aspect-based sentiment analysis","author":"Y Ling","year":"2022","unstructured":"Ling Y, Yu J, Xia R, et al. Vision-language pre-training for multimodal aspect-based sentiment analysis. 2022. ArXiv:2204.07955"},{"key":"4593_CR79","first-page":"4395","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"X Ju","year":"2021","unstructured":"Ju X, Zhang D, Xiao R, et al. Joint multi-modal aspect-sentiment analysis with auxiliary cross-modal relation detection. 
In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2021. 4395\u20134405"},{"key":"4593_CR80","doi-asserted-by":"crossref","first-page":"103038","DOI":"10.1016\/j.ipm.2022.103038","volume":"59","author":"L Yang","year":"2022","unstructured":"Yang L, Na J C, Yu J. Cross-modal multitask transformer for end-to-end multimodal aspect-based sentiment analysis. Inf Process Manage, 2022, 59: 103038","journal-title":"Inf Process Manage"},{"key":"4593_CR81","doi-asserted-by":"crossref","first-page":"102304","DOI":"10.1016\/j.inffus.2024.102304","volume":"106","author":"L Xiao","year":"2024","unstructured":"Xiao L, Wu X, Xu J, et al. Atlantis: aesthetic-oriented multiple granularities fusion network for joint multimodal aspect-based sentiment analysis. Inf Fusion, 2024, 106: 102304","journal-title":"Inf Fusion"},{"key":"4593_CR82","volume-title":"M2DF: multi-grained multi-curriculum denoising framework for multimodal aspect-based sentiment analysis","author":"F Zhao","year":"2023","unstructured":"Zhao F, Li C, Wu Z, et al. M2DF: multi-grained multi-curriculum denoising framework for multimodal aspect-based sentiment analysis. 2023. ArXiv:2310.14605"},{"key":"4593_CR83","volume-title":"AoM: detecting aspect-oriented information for multimodal aspect-based sentiment analysis","author":"R Zhou","year":"2023","unstructured":"Zhou R, Guo W, Liu X, et al. AoM: detecting aspect-oriented information for multimodal aspect-based sentiment analysis. 2023. ArXiv:2306.01004"},{"key":"4593_CR84","volume-title":"RNG: reducing multi-level noise and multi-grained semantic GAP for joint multimodal aspect-sentiment analysis","author":"Y Liu","year":"2024","unstructured":"Liu Y, Zhou Y, Li Z, et al. RNG: reducing multi-level noise and multi-grained semantic GAP for joint multimodal aspect-sentiment analysis. 2024. 
ArXiv:2405.13059"},{"key":"4593_CR85","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10462-023-10620-2","volume":"57","author":"Y Li","year":"2024","unstructured":"Li Y, Ding H, Lin Y, et al. Multi-level textual-visual alignment and fusion network for multimodal aspect-based sentiment analysis. Artif Intell Rev, 2024, 57: 1\u201326","journal-title":"Artif Intell Rev"},{"key":"4593_CR86","volume-title":"Few-shot joint multimodal aspect-sentiment analysis based on generative multimodal prompt","author":"X Yang","year":"2023","unstructured":"Yang X, Feng S, Wang D, et al. Few-shot joint multimodal aspect-sentiment analysis based on generative multimodal prompt. 2023. ArXiv:2305.10169"},{"key":"4593_CR87","doi-asserted-by":"crossref","first-page":"103724","DOI":"10.1016\/j.ipm.2024.103724","volume":"61","author":"L Yang","year":"2024","unstructured":"Yang L, Wang Z, Li Z, et al. An empirical study of multimodal entity-based sentiment analysis with ChatGPT: improving in-context learning via entity-aware contrastive learning. Inf Process Manage, 2024, 61: 103724","journal-title":"Inf Process Manage"},{"key":"4593_CR88","doi-asserted-by":"crossref","first-page":"169","DOI":"10.1145\/2070481.2070509","volume-title":"Proceedings of the 13th International Conference on Multimodal Interfaces","author":"L P Morency","year":"2011","unstructured":"Morency L P, Mihalcea R, Doshi P, et al. Towards multimodal sentiment analysis: harvesting opinions from the web. In: Proceedings of the 13th International Conference on Multimodal Interfaces, 2011. 169\u2013176"},{"key":"4593_CR89","volume-title":"MOSI: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos","author":"A Zadeh","year":"2016","unstructured":"Zadeh A, Zellers R, Pincus E, et al. MOSI: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. 2016. 
ArXiv:1606.06259"},{"key":"4593_CR90","first-page":"2236","volume-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"A B Zadeh","year":"2018","unstructured":"Zadeh A B, Liang P P, Poria S, et al. Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL), 2018. 2236\u20132246"},{"key":"4593_CR91","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"A Zadeh","year":"2020","unstructured":"Zadeh A, Cao Y S, Hessner S, et al. CMU-MOSEAS: a multimodal language dataset for Spanish, Portuguese, German and French. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2020"},{"key":"4593_CR92","doi-asserted-by":"crossref","first-page":"3718","DOI":"10.18653\/v1\/2020.acl-main.343","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"W Yu","year":"2020","unstructured":"Yu W, Xu H, Meng F, et al. CH-SIMS: a Chinese multimodal sentiment analysis dataset with fine-grained annotations of modality. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL), 2020. 3718\u20133727"},{"key":"4593_CR93","first-page":"247","volume-title":"Proceedings of the International Conference on Multimodal Interaction (ICMI)","author":"Y Liu","year":"2022","unstructured":"Liu Y, Yuan Z, Mao H, et al. Make acoustic and visual cues matter: CH-SIMS v2.0 dataset and AV-Mixup consistent module. In: Proceedings of the International Conference on Multimodal Interaction (ICMI), 2022. 247\u2013258"},{"key":"4593_CR94","volume-title":"MELD: a multimodal multi-party dataset for emotion recognition in conversations","author":"S Poria","year":"2018","unstructured":"Poria S, Hazarika D, Majumder N, et al. 
MELD: a multimodal multi-party dataset for emotion recognition in conversations. 2018. ArXiv:1810.02508"},{"key":"4593_CR95","first-page":"1103","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"A Zadeh","year":"2017","unstructured":"Zadeh A, Chen M, Poria S, et al. Tensor fusion network for multimodal sentiment analysis. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2017. 1103\u20131114"},{"key":"4593_CR96","first-page":"7216","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"Y Wang","year":"2019","unstructured":"Wang Y, Shen Y, Liu Z, et al. Words can shift: dynamically adjusting word representations using nonverbal behaviors. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2019. 7216\u20137223"},{"key":"4593_CR97","doi-asserted-by":"crossref","first-page":"1122","DOI":"10.1145\/3394171.3413678","volume-title":"Proceedings of the 28th ACM International Conference on Multimedia","author":"D Hazarika","year":"2020","unstructured":"Hazarika D, Zimmermann R, Poria S, et al. MISA: modality-invariant and -specific representations for multimodal sentiment analysis. In: Proceedings of the 28th ACM International Conference on Multimedia, 2020. 1122\u20131131"},{"key":"4593_CR98","doi-asserted-by":"crossref","first-page":"163","DOI":"10.1145\/3136755.3136801","volume-title":"Proceedings of the 19th ACM International Conference on Multimodal Interaction","author":"M Chen","year":"2017","unstructured":"Chen M, Wang S, Liang P P, et al. Multimodal sentiment analysis with word-level fusion and reinforcement learning. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, 2017. 
163\u2013171"},{"key":"4593_CR99","doi-asserted-by":"crossref","first-page":"2359","DOI":"10.18653\/v1\/2020.acl-main.214","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"W Rahman","year":"2020","unstructured":"Rahman W, Hasan M K, Lee S, et al. Integrating multimodal information in large pretrained transformers. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL), 2020. 2359\u20132369"},{"key":"4593_CR100","doi-asserted-by":"crossref","first-page":"2046","DOI":"10.18653\/v1\/2020.emnlp-main.161","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"L Li","year":"2020","unstructured":"Li L, Chen Y C, Cheng Y, et al. HERO: hierarchical encoder for video+language omni-representation pre-training. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2020. 2046\u20132065"},{"key":"4593_CR101","doi-asserted-by":"crossref","first-page":"479","DOI":"10.1007\/s00530-014-0407-8","volume":"22","author":"D Cao","year":"2016","unstructured":"Cao D, Ji R, Lin D, et al. A cross-media public sentiment analysis system for microblog. Multimedia Syst, 2016, 22: 479\u2013486","journal-title":"Multimedia Syst"},{"key":"4593_CR102","volume-title":"Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis","author":"W Yu","year":"2021","unstructured":"Yu W, Xu H, Yuan Z, et al. Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. 2021. ArXiv:2102.04830"},{"key":"4593_CR103","volume-title":"Sentiment word aware multimodal refinement for multimodal sentiment analysis with ASR errors","author":"Y Wu","year":"2022","unstructured":"Wu Y, Zhao Y, Yang H, et al. 
Sentiment word aware multimodal refinement for multimodal sentiment analysis with ASR errors. 2022. ArXiv:2203.00257"},{"key":"4593_CR104","doi-asserted-by":"crossref","first-page":"6558","DOI":"10.18653\/v1\/P19-1656","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Y H H Tsai","year":"2019","unstructured":"Tsai Y H H, Bai S, Liang P P, et al. Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL), 2019. 6558\u20136569"},{"key":"4593_CR105","first-page":"11151","volume-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"J Huang","year":"2024","unstructured":"Huang J, Pu Y, Zhou D, et al. Multimodal sentiment analysis based on 3D stereoscopic attention. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2024. 11151\u201311155"},{"key":"4593_CR106","doi-asserted-by":"crossref","first-page":"2276","DOI":"10.1109\/TAFFC.2022.3172360","volume":"14","author":"S Mai","year":"2023","unstructured":"Mai S, Zeng Y, Zheng S, et al. Hybrid contrastive learning of tri-modal representation for multimodal sentiment analysis. IEEE Trans Affective Comput, 2023, 14: 2276\u20132289","journal-title":"IEEE Trans Affective Comput"},{"key":"4593_CR107","volume-title":"Multimodal contrastive learning via uni-modal coding and cross-modal prediction for multimodal sentiment analysis","author":"R Lin","year":"2022","unstructured":"Lin R, Hu H. Multimodal contrastive learning via uni-modal coding and cross-modal prediction for multimodal sentiment analysis. 2022. ArXiv:2210.14556"},{"key":"4593_CR108","first-page":"8992","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"Z Sun","year":"2020","unstructured":"Sun Z, Sarma P, Sethares W, et al. 
Learning relationships between text, audio, and video via deep canonical correlation for multimodal language analysis. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2020. 8992\u20138999"},{"key":"4593_CR109","first-page":"164","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"S Mai","year":"2020","unstructured":"Mai S, Hu H, Xing S, et al. Modality to modality translation: an adversarial representation learning and graph fusion network for multimodal fusion. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2020. 164\u2013172"},{"key":"4593_CR110","volume-title":"Efficient low-rank multimodal fusion with modality-specific factors","author":"Z Liu","year":"2018","unstructured":"Liu Z, Shen Y, Lakshminarasimhan V B, et al. Efficient low-rank multimodal fusion with modality-specific factors. 2018. ArXiv:1806.00064"},{"key":"4593_CR111","doi-asserted-by":"crossref","first-page":"481","DOI":"10.18653\/v1\/P19-1046","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"S Mai","year":"2019","unstructured":"Mai S, Hu H, Xing S, et al. Divide, conquer and combine: hierarchical feature fusion network with local and global perspectives for multimodal affective computing. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL), 2019. 481\u2013492"},{"key":"4593_CR112","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"A Zadeh","year":"2018","unstructured":"Zadeh A, Liang P P, Mazumder N, et al. Memory fusion network for multi-view sequential learning. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2018"},{"key":"4593_CR113","volume-title":"Multimodal end-to-end sparse model for emotion recognition","author":"W Dai","year":"2021","unstructured":"Dai W, Cahyawijaya S, Liu Z, et al. 
Multimodal end-to-end sparse model for emotion recognition. 2021. ArXiv:2103.09666"},{"key":"4593_CR114","doi-asserted-by":"crossref","first-page":"111136","DOI":"10.1016\/j.knosys.2023.111136","volume":"283","author":"G Yi","year":"2024","unstructured":"Yi G, Fan C, Zhu K, et al. VLP2MSA: expanding vision-language pre-training to multimodal sentiment analysis. Knowl-Based Syst, 2024, 283: 111136","journal-title":"Knowl-Based Syst"},{"key":"4593_CR115","doi-asserted-by":"crossref","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso C, Bulut M, Lee C C, et al. IEMOCAP: interactive emotional dyadic motion capture database. Lang Resour Evaluation, 2008, 42: 335\u2013359","journal-title":"Lang Resour Evaluation"},{"key":"4593_CR116","volume-title":"M3ED: multi-modal multi-scene multi-label emotional dialogue database","author":"J Zhao","year":"2022","unstructured":"Zhao J, Zhang T, Hu J, et al. M3ED: multi-modal multi-scene multi-label emotional dialogue database. 2022. ArXiv:2205.10237"},{"key":"4593_CR117","doi-asserted-by":"crossref","first-page":"9610","DOI":"10.1145\/3581783.3612836","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia","author":"Z Lian","year":"2023","unstructured":"Lian Z, Sun H, Sun L, et al. MER 2023: multi-label learning, modality robustness, and semi-supervised learning. In: Proceedings of the 31st ACM International Conference on Multimedia, 2023. 9610\u20139614"},{"key":"4593_CR118","volume-title":"Explainable multimodal emotion reasoning","author":"Z Lian","year":"2023","unstructured":"Lian Z, Sun L, Xu M, et al. Explainable multimodal emotion reasoning. 2023. ArXiv:2306.15401"},{"key":"4593_CR119","volume-title":"MER 2024: semi-supervised learning, noise robustness, and open-vocabulary multimodal emotion recognition","author":"Z Lian","year":"2024","unstructured":"Lian Z, Sun H, Sun L, et al. 
MER 2024: semi-supervised learning, noise robustness, and open-vocabulary multimodal emotion recognition. 2024. ArXiv:2404.17113"},{"key":"4593_CR120","doi-asserted-by":"crossref","first-page":"4619","DOI":"10.18653\/v1\/P19-1455","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"S Castro","year":"2019","unstructured":"Castro S, Hazarika D, P\u00e9rez-Rosas V, et al. Towards multimodal sarcasm detection (an obviously perfect paper). In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL), 2019. 4619\u20134629"},{"key":"4593_CR121","doi-asserted-by":"crossref","first-page":"4351","DOI":"10.18653\/v1\/2020.acl-main.401","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"D S Chauhan","year":"2020","unstructured":"Chauhan D S, Dhanush S R, Ekbal A, et al. Sentiment and emotion help sarcasm? A multi-task learning framework for multi-modal sarcasm, sentiment and emotion analysis. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL), 2020. 4351\u20134360"},{"key":"4593_CR122","doi-asserted-by":"crossref","first-page":"86","DOI":"10.1109\/MMUL.2021.3069097","volume":"28","author":"Y Wu","year":"2021","unstructured":"Wu Y, Zhao Y, Lu X, et al. Modeling incongruity between modalities for multimodal sarcasm detection. IEEE MultiMedia, 2021, 28: 86\u201395","journal-title":"IEEE MultiMedia"},{"key":"4593_CR123","first-page":"6298","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"L Chen","year":"2016","unstructured":"Chen L, Zhang H, Xiao J, et al. SCA-CNN: spatial and channel-wise attention in convolutional networks for image captioning. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2016. 
6298\u20136306"},{"key":"4593_CR124","doi-asserted-by":"crossref","first-page":"12039","DOI":"10.1109\/ACCESS.2024.3354844","volume":"12","author":"H Zhao","year":"2024","unstructured":"Zhao H, Yang M, Bai X, et al. A survey on multimodal aspect-based sentiment analysis. IEEE Access, 2024, 12: 12039\u201312052","journal-title":"IEEE Access"},{"key":"4593_CR125","volume-title":"UR-FUNNY: a multimodal language dataset for understanding humor","author":"M K Hasan","year":"2019","unstructured":"Hasan M K, Rahman W, Zadeh A, et al. UR-FUNNY: a multimodal language dataset for understanding humor. 2019. ArXiv:1904.06618"},{"key":"4593_CR126","doi-asserted-by":"crossref","first-page":"2544","DOI":"10.1002\/asi.21416","volume":"61","author":"M Thelwall","year":"2010","unstructured":"Thelwall M, Buckley K, Paltoglou G, et al. Sentiment strength detection in short informal text. J Am Soc Inf Sci, 2010, 61: 2544\u20132558","journal-title":"J Am Soc Inf Sci"},{"key":"4593_CR127","volume-title":"CLMLF: a contrastive learning and multi-layer fusion method for multimodal sentiment detection","author":"Z Li","year":"2022","unstructured":"Li Z, Xu B, Zhu C, et al. CLMLF: a contrastive learning and multi-layer fusion method for multimodal sentiment detection. 2022. ArXiv:2204.05515"},{"key":"4593_CR128","volume-title":"WisdoM: improving multimodal sentiment analysis by fusing contextual world knowledge","author":"W Wang","year":"2024","unstructured":"Wang W, Ding L, Shen L, et al. WisdoM: improving multimodal sentiment analysis by fusing contextual world knowledge. 2024. ArXiv:2401.06659"},{"key":"4593_CR129","doi-asserted-by":"crossref","first-page":"3538","DOI":"10.18653\/v1\/P19-1344","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"D Ma","year":"2019","unstructured":"Ma D, Li S, Wu F, et al. Exploring sequence-to-sequence learning in aspect term extraction. 
In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL), 2019. 3538\u20133547"},{"key":"4593_CR130","doi-asserted-by":"crossref","first-page":"2107","DOI":"10.18653\/v1\/2020.emnlp-main.164","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"Z Chen","year":"2020","unstructured":"Chen Z, Qian T. Enhancing aspect term extraction with soft prototypes. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2020. 2107\u20132117"},{"key":"4593_CR131","volume-title":"Leveraging just a few keywords for fine-grained aspect detection through weakly supervised co-training","author":"G Karamanolakis","year":"2019","unstructured":"Karamanolakis G, Hsu D, Gravano L, et al. Leveraging just a few keywords for fine-grained aspect detection through weakly supervised co-training. 2019. ArXiv:1909.00415"},{"key":"4593_CR132","volume-title":"FSUIE: a novel fuzzy span mechanism for universal information extraction","author":"T Peng","year":"2023","unstructured":"Peng T, Li Z, Zhang L, et al. FSUIE: a novel fuzzy span mechanism for universal information extraction. 2023. ArXiv:2306.14913"},{"key":"4593_CR133","first-page":"18869","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"T Peng","year":"2024","unstructured":"Peng T, Li Z, Wang P, et al. A novel energy based model mechanism for multi-modal aspect-based sentiment analysis. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2024. 18869\u201318878"},{"key":"4593_CR134","volume-title":"A tutorial on energy-based learning. In: Predicting Structured Data","author":"Y LeCun","year":"2006","unstructured":"LeCun Y, Chopra S, Hadsell R, et al. A tutorial on energy-based learning. In: Predicting Structured Data. 
Cambridge: The MIT Press, 2006"},{"key":"4593_CR135","first-page":"303","volume-title":"Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing (AACL-IJCNLP)","author":"M N Sundararaman","year":"2020","unstructured":"Sundararaman M N, Ahmad Z, Ekbal A, et al. Unsupervised aspect-level sentiment controllable style transfer. In: Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing (AACL-IJCNLP), 2020. 303\u2013312"},{"key":"4593_CR136","doi-asserted-by":"crossref","first-page":"7012","DOI":"10.18653\/v1\/2020.emnlp-main.570","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"Y Ji","year":"2020","unstructured":"Ji Y, Liu H, He B, et al. Diversified multiple instance learning for document-level multi-aspect sentiment classification. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2020. 7012\u20137023"},{"key":"4593_CR137","first-page":"150","volume-title":"Proceedings of the 28th International Conference on Computational Linguistics (COLING)","author":"B Liang","year":"2020","unstructured":"Liang B, Yin R, Gui L, et al. Jointly learning aspect-focused and inter-aspect relations with graph convolutional networks for aspect sentiment analysis. In: Proceedings of the 28th International Conference on Computational Linguistics (COLING), 2020. 150\u2013161"},{"key":"4593_CR138","doi-asserted-by":"crossref","first-page":"110021","DOI":"10.1016\/j.knosys.2022.110021","volume":"258","author":"J Ye","year":"2022","unstructured":"Ye J, Zhou J, Tian J, et al. Sentiment-aware multimodal pre-training for multimodal sentiment analysis. 
Knowl-Based Syst, 2022, 258: 110021","journal-title":"Knowl-Based Syst"},{"key":"4593_CR139","first-page":"27730","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","author":"L Ouyang","year":"2022","unstructured":"Ouyang L, Wu J, Jiang X, et al. Training language models to follow instructions with human feedback. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS), 2022. 27730\u201327744"},{"key":"4593_CR140","volume-title":"Self-instruct: aligning language models with self-generated instructions","author":"Y Wang","year":"2022","unstructured":"Wang Y, Kordi Y, Mishra S, et al. Self-instruct: aligning language models with self-generated instructions. 2022. ArXiv:2212.10560"},{"key":"4593_CR141","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","author":"W Dai","year":"2024","unstructured":"Dai W, Li J, Li D, et al. InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS), 2024"},{"key":"4593_CR142","first-page":"270","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"J Ye","year":"2023","unstructured":"Ye J, Zhou J, Tian J, et al. Rethinking TMSC: an empirical study for target-oriented multimodal sentiment classification. In: Proceedings of Findings of the Association for Computational Linguistics, 2023. 270\u2013277"},{"key":"4593_CR143","first-page":"10941","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"X Wei","year":"2020","unstructured":"Wei X, Zhang T, Li Y, et al. Multi-modality cross attention network for image and sentence matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2020. 
10941\u201310950"},{"key":"4593_CR144","first-page":"6077","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"P Anderson","year":"2018","unstructured":"Anderson P, He X, Buehler C, et al. Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2018. 6077\u20136086"},{"key":"4593_CR145","first-page":"704","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"E Riloff","year":"2013","unstructured":"Riloff E, Qadir A, Surve P, et al. Sarcasm as contrast between a positive sentiment and negative situation. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), 2013. 704\u2013714"},{"key":"4593_CR146","first-page":"1010","volume-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Y Tay","year":"2018","unstructured":"Tay Y, Luu A T, Hui S C, et al. Reasoning with sarcasm by reading in between. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL), 2018. 1010\u20131020"},{"key":"4593_CR147","first-page":"2115","volume-title":"Proceedings of the Web Conference (WWW)","author":"T Xiong","year":"2019","unstructured":"Xiong T, Zhang P, Zhu H, et al. Sarcasm detection with self-matching networks and low-rank bilinear pooling. In: Proceedings of the Web Conference (WWW), 2019. 2115\u20132124"},{"key":"4593_CR148","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","author":"R Speer","year":"2017","unstructured":"Speer R, Chin J, Havasi C, et al. ConceptNet 5.5: an open multilingual graph of general knowledge. 
In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 2017"},{"key":"4593_CR149","doi-asserted-by":"crossref","first-page":"102218","DOI":"10.1016\/j.inffus.2023.102218","volume":"105","author":"A V Geetha","year":"2024","unstructured":"Geetha A V, Mala T, Priyanka D, et al. Multimodal emotion recognition with deep learning: advancements, challenges, and future directions. Inf Fusion, 2024, 105: 102218","journal-title":"Inf Fusion"},{"key":"4593_CR150","doi-asserted-by":"crossref","first-page":"6690","DOI":"10.1109\/TCSS.2024.3396345","volume":"11","author":"Z Zhang","year":"2024","unstructured":"Zhang Z, Peng L, Pang T, et al. Refashioning emotion recognition modeling: the advent of generalized large models. IEEE Trans Comput Soc Syst, 2024, 11: 6690\u20136704","journal-title":"IEEE Trans Comput Soc Syst"},{"key":"4593_CR151","first-page":"11326","volume-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"L Peng","year":"2024","unstructured":"Peng L, Zhang Z, Pang T, et al. Customising general large language models for specialised emotion recognition tasks. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2024. 11326\u201311330"},{"key":"4593_CR152","volume-title":"MM-InstructEval: zero-shot evaluation of (multimodal) large language models on multimodal reasoning tasks","author":"X Yang","year":"2024","unstructured":"Yang X, Wu W, Feng S, et al. MM-InstructEval: zero-shot evaluation of (multimodal) large language models on multimodal reasoning tasks. 2024. 
ArXiv:2405.07229"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4593-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-024-4593-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4593-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,4]],"date-time":"2025-10-04T10:38:25Z","timestamp":1759574305000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-024-4593-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,28]]},"references-count":152,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["4593"],"URL":"https:\/\/doi.org\/10.1007\/s11432-024-4593-8","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,28]]},"assertion":[{"value":"29 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 March 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 May 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 September 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"200101"}}