{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T08:41:01Z","timestamp":1777365661537,"version":"3.51.4"},"reference-count":60,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T00:00:00Z","timestamp":1777334400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T00:00:00Z","timestamp":1777334400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["52174184"],"award-info":[{"award-number":["52174184"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Pattern Anal Applic"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s10044-026-01676-1","type":"journal-article","created":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T07:38:53Z","timestamp":1777361933000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Text-enhanced cross-modal reinforced fusion network for multimodal sentiment analysis"],"prefix":"10.1007","volume":"29","author":[{"given":"Wantong","family":"Zhao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongqing","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,4,28]]},"reference":[{"key":"1676_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2023.111206","volume":"152","author":"A Pandey","year":"2024","unstructured":"Pandey A, Vishwakarma DK (2024) Progress, achievements, and challenges in multimodal sentiment analysis using deep learning: A survey. Appl Soft Comput 152:111206. https:\/\/doi.org\/10.1016\/j.asoc.2023.111206","journal-title":"Appl Soft Comput"},{"key":"1676_CR2","doi-asserted-by":"publisher","unstructured":"Li D, Wang Y, Funakoshi K, Okumura M (2023) Joyful: joint modality fusion and graph contrastive learning for multimodal emotion recognition. In: Proceedings of the conference on empirical methods in natural language processing (EMNLP 2023), pp 16051\u201316069. https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.996","DOI":"10.18653\/v1\/2023.emnlp-main.996"},{"key":"1676_CR3","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.111096","volume":"159","author":"K Xiao","year":"2024","unstructured":"Xiao K, Chen B, Yang X, Cai Y (2024) Cross-modal independent matching network for image-text retrieval. Pattern Recognit 159:111096. https:\/\/doi.org\/10.1016\/j.patcog.2024.111096","journal-title":"Pattern Recognit"},{"key":"1676_CR4","doi-asserted-by":"publisher","unstructured":"Zhou R, Guo W, Liu X, Yu S, Zhang Y, Yuan X (2023) AoM: detecting aspect-oriented information for multimodal aspect-based sentiment analysis. In: Findings of the association for computational linguistics: ACL 2023, pp 8184\u20138196. https:\/\/doi.org\/10.18653\/v1\/2023.findings-acl.519","DOI":"10.18653\/v1\/2023.findings-acl.519"},{"key":"1676_CR5","doi-asserted-by":"publisher","unstructured":"Wang W, Ding L, Shen L, Luo Y, Hu H, Tao D (2024) WisdoM: improving multimodal sentiment analysis by fusing contextual world knowledge. In: Proceedings of the 32nd ACM international conference on multimedia (ACM MM 2024), pp 2282\u20132291. https:\/\/doi.org\/10.1145\/3664647.3681403","DOI":"10.1145\/3664647.3681403"},{"key":"1676_CR6","doi-asserted-by":"publisher","unstructured":"Tsai Y-H, Bai S, Liang P-P, Kolter JZ, Morency L-P, Salakhutdinov R (2019) Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the 57th annual meeting of the association for computational linguistics (ACL 2019), pp 6558\u20136569. https:\/\/doi.org\/10.18653\/v1\/P19-1656","DOI":"10.18653\/v1\/P19-1656"},{"key":"1676_CR7","doi-asserted-by":"publisher","unstructured":"Hazarika D, Zimmermann R, Poria S (2020) MISA: modality-invariant and specific representations for multimodal sentiment analysis. In: Proceedings of the 28th ACM international conference on multimedia (ACM MM 2020), pp 1122\u20131131. https:\/\/doi.org\/10.1145\/3394171.3413678","DOI":"10.1145\/3394171.3413678"},{"key":"1676_CR8","doi-asserted-by":"publisher","unstructured":"Yu W, Xu H, Yuan Z, Wu J (2021) Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. In: Proceedings of the thirty-fifth AAAI conference on artificial intelligence (AAAI 2021), pp 10790\u201310797. https:\/\/doi.org\/10.1609\/aaai.v35i12.17286","DOI":"10.1609\/aaai.v35i12.17286"},{"issue":"3","key":"1676_CR9","doi-asserted-by":"publisher","first-page":"132","DOI":"10.1007\/s10044-025-01508-8","volume":"28","author":"Y Qi","year":"2025","unstructured":"Qi Y, Ibrayim M, Tohti T (2025) Contextual xLSTM-based multimodal fusion for conversational emotion recognition. Pattern Anal Appl 28(3):132. https:\/\/doi.org\/10.1007\/s10044-025-01508-8","journal-title":"Pattern Anal Appl"},{"issue":"2","key":"1676_CR10","doi-asserted-by":"publisher","first-page":"40","DOI":"10.1007\/s10044-025-01414-z","volume":"28","author":"J Zhang","year":"2025","unstructured":"Zhang J, Yu Y, Tang S, Qi G, Wu H, Hachiya H (2025) Enhancing semantic audio-visual representation learning with supervised multi-scale attention. Pattern Anal Appl 28(2):40. https:\/\/doi.org\/10.1007\/s10044-025-01414-z","journal-title":"Pattern Anal Appl"},{"key":"1676_CR11","doi-asserted-by":"publisher","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N (2021) An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: Proceedings of the ninth international conference on learning representations (ICLR 2021). https:\/\/doi.org\/10.48550\/arXiv.2010.11929","DOI":"10.48550\/arXiv.2010.11929"},{"key":"1676_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.102456","volume":"105","author":"Y Wang","year":"2024","unstructured":"Wang Y et al (2024) A dual-stage attention-based LSTM network for multimodal temporal data fusion. Knowl Based Syst 105:102456. https:\/\/doi.org\/10.1016\/j.knosys.2023.102456","journal-title":"Knowl Based Syst"},{"key":"1676_CR13","doi-asserted-by":"publisher","unstructured":"Zhang T, Li J, Liu Z, Yang R, Zhao Z (2024) TimesNet: temporal 2D-variation modeling for general time series analysis. https:\/\/doi.org\/10.48550\/arXiv.2210.02186","DOI":"10.48550\/arXiv.2210.02186"},{"key":"1676_CR14","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2025.01.003","volume":"114","author":"C Huang","year":"2025","unstructured":"Huang C et al (2025) AtCAF: attention-based causality-aware fusion network for multimodal sentiment analysis. Inf Fusion 114:102725. https:\/\/doi.org\/10.1016\/j.inffus.2025.01.003","journal-title":"Inf Fusion"},{"key":"1676_CR15","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109259","volume":"136","author":"D Wang","year":"2023","unstructured":"Wang D et al (2023) TETFN: a text enhanced transformer fusion network for multimodal sentiment analysis. Pattern Recognit 136:109259. https:\/\/doi.org\/10.1016\/j.patcog.2023.109259","journal-title":"Pattern Recognit"},{"key":"1676_CR16","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.112220","volume":"300","author":"J Hou","year":"2024","unstructured":"Hou J et al (2024) TCHFN: multimodal sentiment analysis based on text-centric hierarchical fusion network. Knowl Based Syst 300:112220. https:\/\/doi.org\/10.1016\/j.knosys.2024.112220","journal-title":"Knowl Based Syst"},{"issue":"5","key":"1676_CR17","doi-asserted-by":"publisher","first-page":"1145","DOI":"10.1007\/s10796-021-10107-x","volume":"23","author":"S Mendon","year":"2021","unstructured":"Mendon S, Dutta P, Behl A, Lessmann S (2021) A hybrid approach of machine learning and lexicons to sentiment analysis: enhanced insights from Twitter data of natural disasters. Inf Syst Front 23(5):1145\u20131168. https:\/\/doi.org\/10.1007\/s10796-021-10107-x","journal-title":"Inf Syst Front"},{"issue":"5","key":"1676_CR18","doi-asserted-by":"publisher","first-page":"114","DOI":"10.1145\/3640460","volume":"42","author":"D Rau","year":"2024","unstructured":"Rau D, Dehghani M, Kamps J (2024) Revisiting bag of words document representations for efficient ranking with transformers. ACM Trans Inf Syst 42(5):114. https:\/\/doi.org\/10.1145\/3640460","journal-title":"ACM Trans Inf Syst"},{"key":"1676_CR19","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2025.113092","volume":"175","author":"TU Sen","year":"2025","unstructured":"Sen TU, Yakit MC, Gumus MS, Abar O, Bakal G (2025) Combining n-grams and graph convolution for text classification. Appl Soft Comput 175:113092. https:\/\/doi.org\/10.1016\/j.asoc.2025.113092","journal-title":"Appl Soft Comput"},{"key":"1676_CR20","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1016\/j.neucom.2019.10.118","volume":"408","author":"J Cervantes","year":"2020","unstructured":"Cervantes J, Garcia-Lamont F, Rodriguez-Mazahua L, Lopez A (2020) A comprehensive survey on support vector machine classification: applications, challenges and trends. Neurocomputing 408:189\u2013215. https:\/\/doi.org\/10.1016\/j.neucom.2019.10.118","journal-title":"Neurocomputing"},{"key":"1676_CR21","doi-asserted-by":"publisher","DOI":"10.1016\/j.cor.2021.105456","volume":"127","author":"R Blanquero","year":"2021","unstructured":"Blanquero R, Carrizosa E, Ram\u00edrez-Cobo P, Sillero-Denamiel M-R (2021) Variable selection for Na\u00efve Bayes classification. Comput Oper Res 127:105456. https:\/\/doi.org\/10.1016\/j.cor.2021.105456","journal-title":"Comput Oper Res"},{"key":"1676_CR22","doi-asserted-by":"publisher","first-page":"887","DOI":"10.1111\/ecog.03049","volume":"40","author":"SJ Phillips","year":"2017","unstructured":"Phillips SJ, Anderson RP, Dud\u00edk M, Schapire RE, Blair ME (2017) Opening the black box: an open-source release of Maxent. Ecography 40:887\u2013893. https:\/\/doi.org\/10.1111\/ecog.03049","journal-title":"Ecography"},{"issue":"4","key":"1676_CR23","doi-asserted-by":"publisher","first-page":"2753","DOI":"10.3390\/make6040132","volume":"6","author":"Y Gonzalez Tejeda","year":"2024","unstructured":"Gonzalez Tejeda Y, Mayer HA (2024) Deep learning with convolutional neural networks: a compact holistic tutorial with focus on supervised regression. Mach Learn Knowl Extract 6(4):2753\u20132782. https:\/\/doi.org\/10.3390\/make6040132","journal-title":"Mach Learn Knowl Extract"},{"key":"1676_CR24","doi-asserted-by":"publisher","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2019) BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies (NAACL-HLT 2019), pp 4171\u20134186. https:\/\/doi.org\/10.18653\/v1\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"1676_CR25","doi-asserted-by":"publisher","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2019) RoBERTa: a robustly optimized BERT pretraining approach. https:\/\/doi.org\/10.48550\/arXiv.1907.11692","DOI":"10.48550\/arXiv.1907.11692"},{"issue":"14","key":"1676_CR26","doi-asserted-by":"publisher","first-page":"2835","DOI":"10.3390\/electronics13142835","volume":"13","author":"Y Fu","year":"2024","unstructured":"Fu Y, Fu J, Xue H, Xu Z (2024) Self-HCL: self-supervised multitask learning with hybrid contrastive learning strategy for multimodal sentiment analysis. Electronics 13(14):2835. https:\/\/doi.org\/10.3390\/electronics13142835","journal-title":"Electronics"},{"key":"1676_CR27","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107868","volume":"114","author":"W Sheng","year":"2021","unstructured":"Sheng W, Li X (2021) Multi-task learning for gait-based identity recognition and emotion recognition using attention enhanced temporal graph convolutional network. Pattern Recognit 114:107868. https:\/\/doi.org\/10.1016\/j.patcog.2021.107868","journal-title":"Pattern Recognit"},{"key":"1676_CR28","doi-asserted-by":"publisher","unstructured":"Poria S, Chaturvedi I, Cambria E, Hussain A (2016) Convolutional MKL-based multimodal emotion recognition and sentiment analysis. In: 2016 IEEE 16th international conference on data mining (ICDM 2016), pp 439\u2013448. https:\/\/doi.org\/10.1109\/ICDM.2016.0134","DOI":"10.1109\/ICDM.2016.0134"},{"key":"1676_CR29","doi-asserted-by":"publisher","unstructured":"Zadeh A, Liang P-P, Mazumder N, Poria S, Cambria E, Morency L-P (2018) Memory fusion network for multi-view sequential learning. In: Proceedings of the AAAI conference on artificial intelligence (AAAI 2018), vol 32, no 1, pp 5634\u20135641. https:\/\/doi.org\/10.1609\/aaai.v32i1.11552","DOI":"10.1609\/aaai.v32i1.11552"},{"key":"1676_CR30","doi-asserted-by":"publisher","unstructured":"Kampman O, Barezi EJ, Bertero D, Fung P (2018) Investigating audio, visual, and text fusion methods for end-to-end automatic personality prediction. In: Proceedings of the 56th annual meeting of the association for computational linguistics (ACL 2018): short papers, pp 606\u2013611. https:\/\/doi.org\/10.18653\/v1\/P18-2096","DOI":"10.18653\/v1\/P18-2096"},{"key":"1676_CR31","doi-asserted-by":"publisher","unstructured":"Zadeh A, Chen M, Poria S, Cambria E, Morency L-P (2017) Tensor fusion network for multimodal sentiment analysis. In: Proceedings of the 2017 conference on empirical methods in natural language processing (EMNLP 2017), pp 1103\u20131114. https:\/\/doi.org\/10.18653\/v1\/D17-1126","DOI":"10.18653\/v1\/D17-1126"},{"key":"1676_CR32","doi-asserted-by":"publisher","unstructured":"Mai S, Hu H, Xing S (2019) Divide, conquer and combine: hierarchical feature fusion network with local and global perspectives for multimodal affective computing. In: Proceedings of the 57th annual meeting of the association for computational linguistics (ACL 2019), pp 481\u2013492. https:\/\/doi.org\/10.18653\/v1\/P19-1046","DOI":"10.18653\/v1\/P19-1046"},{"key":"1676_CR33","doi-asserted-by":"publisher","unstructured":"Yang D, Yang K, Li M, Wang S, Wang S, Zhang L (2024) Robust emotion recognition in context debiasing. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR 2024), pp 12447\u201312457. https:\/\/doi.org\/10.1109\/CVPR52733.2024.01183","DOI":"10.1109\/CVPR52733.2024.01183"},{"issue":"5","key":"1676_CR34","doi-asserted-by":"publisher","first-page":"2807","DOI":"10.1007\/s00530-023-01135-5","volume":"29","author":"H Li","year":"2023","unstructured":"Li H, Li X, Wang Z et al (2023) Learning disentangled multimodal features for robust emotion recognition. Multim Syst 29(5):2807\u20132820. https:\/\/doi.org\/10.1007\/s00530-023-01135-5","journal-title":"Multim Syst"},{"issue":"4","key":"1676_CR35","doi-asserted-by":"publisher","first-page":"1347","DOI":"10.1007\/s00530-022-00917-7","volume":"28","author":"L Wang","year":"2022","unstructured":"Wang L, Zhang M (2022) Audio-text cross-modal attention with semantic alignment for multimodal sentiment analysis. Multim Syst 28(4):1347\u20131361. https:\/\/doi.org\/10.1007\/s00530-022-00917-7","journal-title":"Multim Syst"},{"issue":"3","key":"1676_CR36","doi-asserted-by":"publisher","first-page":"911","DOI":"10.1007\/s00530-021-00887-2","volume":"28","author":"Y Chen","year":"2022","unstructured":"Chen Y, Wang J, Zhang Y (2022) Explicit cross-modal alignment via graph neural networks for multimodal sentiment analysis. Multim Syst 28(3):911\u2013925. https:\/\/doi.org\/10.1007\/s00530-021-00887-2","journal-title":"Multim Syst"},{"issue":"4","key":"1676_CR37","doi-asserted-by":"publisher","first-page":"149","DOI":"10.1007\/s10044-024-01369-7","volume":"27","author":"SM Jiddah","year":"2024","unstructured":"Jiddah SM, Yurtkan K (2024) Feature fusion for human compound emotion recognition: a fusion of facial expression texture and action unit data. Pattern Anal Appl 27(4):149. https:\/\/doi.org\/10.1007\/s10044-024-01369-7","journal-title":"Pattern Anal Appl"},{"issue":"4","key":"1676_CR38","doi-asserted-by":"publisher","first-page":"2401","DOI":"10.1007\/s13042-024-02398-8","volume":"16","author":"Y Wang","year":"2024","unstructured":"Wang Y, Jiang C (2024) Fine-grained multimodal named entity recognition with heterogeneous image-text similarity graphs. Int J Mach Learn Cybern 16(4):2401\u20132415. https:\/\/doi.org\/10.1007\/s13042-024-02398-8","journal-title":"Int J Mach Learn Cybern"},{"key":"1676_CR39","doi-asserted-by":"publisher","DOI":"10.1016\/j.istruc.2024.106602","volume":"64","author":"Z Wang","year":"2024","unstructured":"Wang Z, Peng Z (2024) Structural acceleration response reconstruction based on BiLSTM network and multi-head attention mechanism. Structures 64:106602. https:\/\/doi.org\/10.1016\/j.istruc.2024.106602","journal-title":"Structures"},{"key":"1676_CR40","doi-asserted-by":"publisher","first-page":"5427","DOI":"10.1007\/s40747-024-01445-9","volume":"10","author":"Y Wang","year":"2024","unstructured":"Wang Y, Wang W, Li Y, Jia Y, Xu Y, Ling Y, Ma J (2024) An attention mechanism module with spatial perception and channel information interaction. Complex Intell Syst 10:5427\u20135444. https:\/\/doi.org\/10.1007\/s40747-024-01445-9","journal-title":"Complex Intell Syst"},{"issue":"13","key":"1676_CR41","doi-asserted-by":"publisher","first-page":"7489","DOI":"10.3390\/app13137489","volume":"13","author":"P He","year":"2023","unstructured":"He P, Qi H, Wang S, Cang J (2023) Cross-modal sentiment analysis of text and video based on Bi-GRU cyclic network and correlation enhancement. Appl Sci 13(13):7489. https:\/\/doi.org\/10.3390\/app13137489","journal-title":"Appl Sci"},{"key":"1676_CR42","doi-asserted-by":"publisher","unstructured":"Warner B, Chaffin A, Clavi\u00e9 B, Weller O, Hallstr\u00f6m O, Taghadouini S, Gallagher A, Biswas R, Ladhak F, Aarsen T, Adams GT, Howard J, Poli I (2025) Smarter, better, faster, longer: a modern bidirectional encoder for fast, memory efficient, and long context finetuning and inference. In: Proceedings of the 63rd annual meeting of the association for computational linguistics (ACL 2025), volume 1: long papers, pp 2526\u20132547. https:\/\/doi.org\/10.18653\/v1\/2025.acl-long.127","DOI":"10.18653\/v1\/2025.acl-long.127"},{"key":"1676_CR43","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1007\/s44443-026-00472-5","volume":"38","author":"K Jiang","year":"2026","unstructured":"Jiang K, Xiao X, Lu X et al (2026) SemCap: sentiment-aware semantic captioning for multimodal aspect-based sentiment analysis. J King Saud Univ Comput Inf Sci 38:81. https:\/\/doi.org\/10.1007\/s44443-026-00472-5","journal-title":"J King Saud Univ Comput Inf Sci"},{"key":"1676_CR44","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111369","volume":"162","author":"W Zou","year":"2025","unstructured":"Zou W, Sun X, Lu Q, Wang X, Feng J (2025) A vision and language hierarchical alignment for multimodal aspect-based sentiment analysis. Pattern Recognit 162:111369. https:\/\/doi.org\/10.1016\/j.patcog.2025.111369","journal-title":"Pattern Recognit"},{"key":"1676_CR45","doi-asserted-by":"publisher","unstructured":"Zhu L, Sun H, Gao Q et al (2025) Aspect enhancement and text simplification in multimodal aspect-based sentiment analysis for multi-aspect and multi-sentiment scenarios. In: Proceedings of the AAAI conference on artificial intelligence, vol 39, no 2, pp 1683\u20131691. https:\/\/doi.org\/10.1609\/aaai.v39i2.32161","DOI":"10.1609\/aaai.v39i2.32161"},{"key":"1676_CR46","doi-asserted-by":"publisher","unstructured":"Litvinova T, Shlyakhta D, Romanov A (2024) Re-evaluating Word2Vec and GloVe in the era of transformers: a case study on semantic change detection. In: Proceedings of the 2024 conference on empirical methods in natural language processing (EMNLP 2024), pp 1\u201312. https:\/\/doi.org\/10.18653\/v1\/2024.emnlp-main.1","DOI":"10.18653\/v1\/2024.emnlp-main.1"},{"key":"1676_CR47","doi-asserted-by":"publisher","unstructured":"Vallebueno A, Handan-Nader C, Manning C-D, Ho DE (2024) Statistical uncertainty in word embeddings: GloVe-V. In: Proceedings of the 2024 conference on empirical methods in natural language processing (EMNLP 2024), pp 9032\u20139047. https:\/\/doi.org\/10.18653\/v1\/2024.emnlp-main.510","DOI":"10.18653\/v1\/2024.emnlp-main.510"},{"key":"1676_CR48","doi-asserted-by":"publisher","unstructured":"Ethayarajh K (2019) How contextual are contextualized word representations? Comparing the geometry of BERT, ELMo, and GPT-2 embeddings. In: Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP 2019), pp 55\u201365. https:\/\/doi.org\/10.18653\/v1\/D19-1006","DOI":"10.18653\/v1\/D19-1006"},{"key":"1676_CR49","doi-asserted-by":"publisher","unstructured":"Venkit PN, Srinath M, Gautam S, Venkatraman S, Gupta V, Passonneau RJ, Wilson S (2023) The sentiment problem: a critical survey towards deconstructing sentiment analysis. In: Proceedings of the 2023 conference on empirical methods in natural language processing (EMNLP 2023), pp 13743\u201313763. https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.848","DOI":"10.18653\/v1\/2023.emnlp-main.848"},{"key":"1676_CR50","doi-asserted-by":"publisher","unstructured":"Lu J, Batra D, Parikh D, Lee S (2019) ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in neural information processing systems 32 (NeurIPS 2019), pp 13\u201323. https:\/\/doi.org\/10.5555\/3454287.3454289","DOI":"10.5555\/3454287.3454289"},{"key":"1676_CR51","doi-asserted-by":"publisher","unstructured":"Su W, Zhu X, Cao Y, Li B, Lu L, Wei F, Dai J (2020) VL-BERT: pre-training of generic visual-linguistic representations. In: International conference on learning representations (ICLR 2020). https:\/\/doi.org\/10.48550\/arXiv.1908.08530","DOI":"10.48550\/arXiv.1908.08530"},{"key":"1676_CR52","doi-asserted-by":"publisher","unstructured":"Triantafyllopoulos A, Schuller BW (2024) COVERAP: a modular Python framework for open-source computational paralinguistics. In: Proceedings of the 32nd ACM international conference on multimedia (ACM MM 2024), pp 1\u20139. https:\/\/doi.org\/10.1145\/3664647.3680901","DOI":"10.1145\/3664647.3680901"},{"key":"1676_CR53","doi-asserted-by":"publisher","unstructured":"Pan Y, Shen P, Liu J, Wang L (2024) Advancing speech emotion recognition through glottal source feature integration and multi-scale fusion. In: Proceedings of the 32nd ACM international conference on multimedia (ACM MM 2024), pp 8912\u20138922. https:\/\/doi.org\/10.1145\/3664647.3680895","DOI":"10.1145\/3664647.3680895"},{"key":"1676_CR54","doi-asserted-by":"publisher","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing systems 30 (NeurIPS 2017), pp 5998\u20136008. https:\/\/doi.org\/10.5555\/3295222.3295349","DOI":"10.5555\/3295222.3295349"},{"key":"1676_CR55","doi-asserted-by":"publisher","unstructured":"Zadeh A, Zellers R, Pincus E, Morency L-P (2016) MOSI: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. https:\/\/doi.org\/10.48550\/arXiv.1606.06259","DOI":"10.48550\/arXiv.1606.06259"},{"key":"1676_CR56","doi-asserted-by":"publisher","unstructured":"Zadeh A, Pu P (2018) Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th annual meeting of the association for computational linguistics (ACL 2018), pp 2236\u20132246. https:\/\/doi.org\/10.18653\/v1\/P18-1215","DOI":"10.18653\/v1\/P18-1215"},{"key":"1676_CR57","doi-asserted-by":"publisher","unstructured":"Liu Z, Shen Y, Morency L-P (2018) Efficient low-rank multimodal fusion with modality-specific factors. In: Proceedings of the 56th annual meeting of the association for computational linguistics (ACL 2018). https:\/\/doi.org\/10.18653\/v1\/P18-1209","DOI":"10.18653\/v1\/P18-1209"},{"key":"1676_CR58","doi-asserted-by":"publisher","unstructured":"Wang Y, Shen Y, Liu Z, Liang P-P, Zadeh A, Morency L-P (2019) Words can shift: dynamically adjusting word representations using nonverbal behaviors. In: Proceedings of the AAAI conference on artificial intelligence (AAAI 2019), pp 7216\u20137223. https:\/\/doi.org\/10.1609\/aaai.v33i01.33017216","DOI":"10.1609\/aaai.v33i01.33017216"},{"key":"1676_CR59","doi-asserted-by":"publisher","unstructured":"Sun Z, Sarma P, Sethares W, Liang Y (2020) Learning relationships between text, audio, and video via deep canonical correlation for multimodal language analysis. In: Proceedings of the AAAI conference on artificial intelligence (AAAI 2020), pp 8992\u20138999. https:\/\/doi.org\/10.1609\/aaai.v34i05.6431","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"1676_CR60","doi-asserted-by":"publisher","unstructured":"Rahman W, Hasan M-K, Lee S, Zadeh A, Mao C, Morency L-P, Hoque E (2020) Integrating multimodal information in large pretrained transformers. In: Proceedings of the annual meeting of the association for computational linguistics (ACL 2020), pp 2359\u20132369. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.222","DOI":"10.18653\/v1\/2020.acl-main.222"}],"container-title":["Pattern Analysis and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-026-01676-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10044-026-01676-1","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-026-01676-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T07:38:56Z","timestamp":1777361936000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10044-026-01676-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,28]]},"references-count":60,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["1676"],"URL":"https:\/\/doi.org\/10.1007\/s10044-026-01676-1","relation":{},"ISSN":["1433-7541","1433-755X"],"issn-type":[{"value":"1433-7541","type":"print"},{"value":"1433-755X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4,28]]},"assertion":[{"value":"7 February 2026","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 April 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 April 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"88"}}