{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T11:32:56Z","timestamp":1777894376048,"version":"3.51.4"},"reference-count":54,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022ZD0117103"],"award-info":[{"award-number":["2022ZD0117103"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62477022"],"award-info":[{"award-number":["62477022"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62577027"],"award-info":[{"award-number":["62577027"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62177022"],"award-info":[{"award-number":["62177022"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005234","name":"Central China Normal University","doi-asserted-by":"publisher","award":["CCNU25ai003"],"award-info":[{"award-number":["CCNU25ai003"]}],"id":[{"id":"10.13039\/501100005234","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Applied Soft Computing"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.asoc.2026.114869","type":"journal-article","created":{"date-parts":[[2026,2,18]],"date-time":"2026-02-18T00:35:47Z","timestamp":1771374947000},"page":"114869","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Leveraging hierarchical attention and dynamic fusion mechanisms for multi-modal speech emotion recognition"],"prefix":"10.1016","volume":"193","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4022-9139","authenticated-orcid":false,"given":"Zengzhao","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chuanxu","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6960-509X","authenticated-orcid":false,"given":"Zhifeng","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5359-8945","authenticated-orcid":false,"given":"Chuan","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiuyu","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0951-1072","authenticated-orcid":false,"given":"Jianwen","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.asoc.2026.114869_bib0005","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"Speech emotion recognition based on large-scale automatic speech recognizer","author":"Fukuda","year":"2025"},{"key":"10.1016\/j.asoc.2026.114869_bib0010","article-title":"Design and analysis of a closed-loop emotion regulation system based on multimodal affective computing and emotional markov chain","author":"Wang","year":"2025","journal-title":"IEEE Trans. Syst. Man Cybern. Syst."},{"key":"10.1016\/j.asoc.2026.114869_bib0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.specom.2024.103102","article-title":"Analyzing the influence of different speech data corpora and speech features on speech emotion recognition: a review","volume":"162","author":"Rathi","year":"2024","journal-title":"Speech Commun."},{"issue":"1","key":"10.1016\/j.asoc.2026.114869_bib0020","doi-asserted-by":"crossref","first-page":"38","DOI":"10.18689\/mjns-1000107","article-title":"Does personality effect emotion facial recognition? a comparison between ekman\u2019s emotion hexagon test and a newly created measure","volume":"1","author":"Jenkins","year":"2017","journal-title":"Madridge J. Neurosci."},{"key":"10.1016\/j.asoc.2026.114869_bib0025","series-title":"Emotions revealed: recognizing faces and feelings to improve communication and emotional life","author":"Ekman","year":"2003"},{"key":"10.1016\/j.asoc.2026.114869_bib0030","series-title":"The nature of emotion: fundamental questions","first-page":"55","article-title":"Moods, Emotions, and Traits","author":"Ekman","year":"1994"},{"key":"10.1016\/j.asoc.2026.114869_bib0035","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.121419","article-title":"Learning facial expression and body gesture visual information for video emotion recognition","volume":"237","author":"Wei","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.asoc.2026.114869_bib0040","doi-asserted-by":"crossref","DOI":"10.1007\/s10772-024-10096-7","article-title":"Automatic speech emotion recognition: a systematic literature review","author":"Mustafa","year":"2024","journal-title":"Int. J. Speech Technol."},{"key":"10.1016\/j.asoc.2026.114869_bib0045","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1016\/j.imavis.2017.08.003","article-title":"A survey of multimodal sentiment analysis","volume":"65","author":"Soleymani","year":"2017","journal-title":"Image Vis. Comput."},{"issue":"4","key":"10.1016\/j.asoc.2026.114869_bib0050","doi-asserted-by":"crossref","first-page":"505","DOI":"10.1016\/S0959-4388(00)00241-5","article-title":"Sensory modalities are not separate modalities: plasticity and interactions","volume":"11","author":"Shimojo","year":"2001","journal-title":"Curr. Opin. Neurobiol."},{"key":"10.1016\/j.asoc.2026.114869_bib0055","series-title":"ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"3020","article-title":"Efficient speech emotion recognition using multi-scale CNN and attention","author":"Peng","year":"2021"},{"key":"10.1016\/j.asoc.2026.114869_bib0060","series-title":"An attention pooling based representation learning method for speech emotion recognition","author":"Li","year":"2018"},{"key":"10.1016\/j.asoc.2026.114869_bib0065","series-title":"Proceedings of the 25th ACM International Conference on Multimedia","first-page":"478","article-title":"An image-based deep spectrum feature representation for the recognition of emotional speech","author":"Cummins","year":"2017"},{"key":"10.1016\/j.asoc.2026.114869_bib0070","doi-asserted-by":"crossref","first-page":"79861","DOI":"10.1109\/ACCESS.2020.2990405","article-title":"Clustering-based speech emotion recognition by incorporating learned features and deep bilstm","volume":"8","author":"Sajjad","year":"2020","journal-title":"IEEE Access"},{"key":"10.1016\/j.asoc.2026.114869_bib0075","series-title":"Proceedings of the Joint Workshop of the 4th Workshop on Affective Social Multimedia Computing and First Multi-Modal Affective Computing of Large-Scale Multimedia Data","first-page":"27","article-title":"Deep spectrum feature representations for speech emotion recognition","author":"Zhao","year":"2018"},{"key":"10.1016\/j.asoc.2026.114869_bib0080","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1155\/2022\/7212366","article-title":"GloVe-CNN-BiLSTM model for sentiment analysis on text reviews","volume":"2022","author":"Xiaoyan","year":"2022","journal-title":"J. Sens."},{"key":"10.1016\/j.asoc.2026.114869_bib0085","first-page":"1","article-title":"A text sentiment classification model using double word embedding methods","author":"Zhou","year":"2022","journal-title":"Multim. Tools Appl."},{"key":"10.1016\/j.asoc.2026.114869_bib0090","author":"Devlin"},{"key":"10.1016\/j.asoc.2026.114869_bib0095","series-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"10.1016\/j.asoc.2026.114869_bib0100","series-title":"Proceedings of the 22nd Nordic Conference on Computational Linguistics","first-page":"187","article-title":"Aspect-based sentiment analysis using BERT","author":"Hoang","year":"2019"},{"key":"10.1016\/j.asoc.2026.114869_bib0105","series-title":"2019 Artificial Intelligence for Transforming Business and Society (AITB)","first-page":"1","article-title":"Fine-grained sentiment classification using BERT","volume":"vol. 1","author":"Munikar","year":"2019"},{"key":"10.1016\/j.asoc.2026.114869_bib0110","doi-asserted-by":"crossref","first-page":"13059","DOI":"10.1007\/s11042-020-10285-x","article-title":"Attention-based multimodal contextual fusion for sentiment and emotion classification using bidirectional LSTM","volume":"80","author":"Huddar","year":"2021","journal-title":"Multimed. Tools Appl."},{"key":"10.1016\/j.asoc.2026.114869_bib0115","series-title":"2018 17th IEEE International Conference on Machine Learning and Applications (ICMLA)","first-page":"1475","article-title":"Multimodal sentiment analysis using deep learning","author":"Sharma","year":"2018"},{"key":"10.1016\/j.asoc.2026.114869_bib0120","doi-asserted-by":"crossref","first-page":"306","DOI":"10.1016\/j.inffus.2023.02.028","article-title":"Multimodal sentiment analysis based on fusion methods: a survey","volume":"95","author":"Zhu","year":"2023","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.asoc.2026.114869_bib0125","doi-asserted-by":"crossref","first-page":"723","DOI":"10.1007\/s11280-013-0221-9","article-title":"Building emotional dictionary for sentiment analysis of online news","volume":"17","author":"Rao","year":"2014","journal-title":"World Wide Web"},{"key":"10.1016\/j.asoc.2026.114869_bib0130","first-page":"1","article-title":"Sentiment analysis using dictionary-based lexicon approach: analysis on the opinion of Indian community for the topic of cryptocurrency","author":"Loomba","year":"2023","journal-title":"Ann. Data Sci."},{"key":"10.1016\/j.asoc.2026.114869_bib0135","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/s13673-017-0116-3","article-title":"Optimization of sentiment analysis using machine learning classifiers","volume":"7","author":"Singh","year":"2017","journal-title":"Hum.-Centric Comput. Inf. Sci."},{"key":"10.1016\/j.asoc.2026.114869_bib0140","series-title":"2019 IEEE Eurasia Conference on Biomedical Engineering, Healthcare and Sustainability (ECBIOS)","first-page":"108","article-title":"Sentiment analysis with CNNS built on LSTM on tourists comments","author":"Gao","year":"2019"},{"issue":"2","key":"10.1016\/j.asoc.2026.114869_bib0145","doi-asserted-by":"crossref","first-page":"379","DOI":"10.1007\/s10844-021-00692-3","article-title":"RecogNet-LSTM+ CNN: a hybrid network with attention mechanism for aspect categorization and sentiment classification","volume":"58","author":"Ramaswamy","year":"2022","journal-title":"J. Intell. Inf. Syst."},{"key":"10.1016\/j.asoc.2026.114869_bib0150","series-title":"AI and Analytics for Public Health: Proceedings of the 2020 INFORMS International Conference on Service Science","first-page":"317","article-title":"Sentiment analysis based on BERT and transformer","author":"Yue","year":"2022"},{"issue":"11","key":"10.1016\/j.asoc.2026.114869_bib0155","article-title":"Transformer based contextual model for sentiment analysis of customer reviews: a fine-tuned BERT","volume":"12","author":"Durairaj","year":"2021","journal-title":"Int. J. Adv. Comput. Sci. Appl."},{"key":"10.1016\/j.asoc.2026.114869_bib0160","series-title":"At the border of Acoustics and linguistics: bag-of-audio-words for the recognition of emotions in speech","author":"Schmitt","year":"2016"},{"key":"10.1016\/j.asoc.2026.114869_bib0165","series-title":"Interspeech","article-title":"Recognition of emotions in interactive voice response systems","author":"Yacoub","year":"2003"},{"key":"10.1016\/j.asoc.2026.114869_bib0170","doi-asserted-by":"crossref","first-page":"312","DOI":"10.1016\/j.bspc.2018.08.035","article-title":"Speech emotion recognition using deep 1d & 2d CNN LSTM networks","volume":"47","author":"Zhao","year":"2019","journal-title":"Biomed. Signal Process. Control"},{"key":"10.1016\/j.asoc.2026.114869_bib0175","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2022.118943","article-title":"Learning multi-scale features for speech emotion recognition with connection attention mechanism","volume":"214","author":"Chen","year":"2023","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.asoc.2026.114869_bib0180","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2023.111077","article-title":"Spatio-temporal representation learning enhanced speech emotion recognition with multi-head attention mechanisms","volume":"281","author":"Chen","year":"2023","journal-title":"Knowl.Based Syst."},{"key":"10.1016\/j.asoc.2026.114869_bib0185","series-title":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"11686","article-title":"Disentanglement network: disentangle the emotional features from acoustic features for speech emotion recognition","author":"Yuan","year":"2024"},{"issue":"20","key":"10.1016\/j.asoc.2026.114869_bib0190","doi-asserted-by":"crossref","first-page":"43241","DOI":"10.1109\/JIOT.2025.3595096","article-title":"Enhanced speech emotion recognition in noisy environments: adaptive emotion denoising diffusion approach with iterative confidence learning strategy","volume":"12","author":"Liu","year":"2025","journal-title":"IEEE Internet Things J."},{"key":"10.1016\/j.asoc.2026.114869_bib0195","series-title":"2022 Third International Conference on Intelligent Computing Instrumentation and Control Technologies (ICICICT)","first-page":"327","article-title":"A novel multimodal speech emotion recognition system","author":"Asiya","year":"2022"},{"issue":"1","key":"10.1016\/j.asoc.2026.114869_bib0200","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1109\/TBC.2022.3215245","article-title":"FV2ES: a fully end2end multimodal system for fast yet effective video emotion recognition inference","volume":"69","author":"Wei","year":"2022","journal-title":"IEEE Trans. Broadcast."},{"key":"10.1016\/j.asoc.2026.114869_bib0205","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"3417","article-title":"Nested hierarchical transformer: towards accurate, data-efficient and interpretable visual understanding","volume":"vol. 36","author":"Zhang","year":"2022"},{"key":"10.1016\/j.asoc.2026.114869_bib0210","series-title":"2018 IEEE Conference on Multimedia Information Processing and Retrieval (MIPR)","first-page":"196","article-title":"Self-attentive feature-level fusion for multimodal emotion detection","author":"Hazarika","year":"2018"},{"key":"10.1016\/j.asoc.2026.114869_bib0215","series-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"2236","article-title":"Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph","author":"Zadeh","year":"2018"},{"issue":"21","key":"10.1016\/j.asoc.2026.114869_bib0220","doi-asserted-by":"crossref","first-page":"9981","DOI":"10.3390\/app14219981","article-title":"Speech emotion recognition using transfer learning: integration of advanced speaker embeddings and image recognition models","volume":"14","author":"Jakubec","year":"2024","journal-title":"Appl. Sci."},{"issue":"4","key":"10.1016\/j.asoc.2026.114869_bib0225","doi-asserted-by":"crossref","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","article-title":"Iemocap: interactive emotional dyadic motion capture database","volume":"42","author":"Busso","year":"2008","journal-title":"Lang. Resour. Eval."},{"issue":"5","key":"10.1016\/j.asoc.2026.114869_bib0230","doi-asserted-by":"crossref","first-page":"90","DOI":"10.1145\/3129340","article-title":"Speech emotion recognition: two decades in a nutshell, benchmarks, and ongoing trends","volume":"61","author":"Schuller","year":"2018","journal-title":"Commun. ACM"},{"issue":"2","key":"10.1016\/j.asoc.2026.114869_bib0235","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1109\/TSA.2004.838534","article-title":"Toward detecting emotions in spoken dialogs","volume":"13","author":"Lee","year":"2005","journal-title":"IEEE Trans. Speech Audio Process."},{"issue":"5","key":"10.1016\/j.asoc.2026.114869_bib0240","doi-asserted-by":"crossref","first-page":"713","DOI":"10.3390\/electronics9050713","article-title":"Attention-LSTM-attention model for speech emotion recognition and analysis of iemocap database","volume":"9","author":"Yu","year":"2020","journal-title":"Electronics"},{"key":"10.1016\/j.asoc.2026.114869_bib0245","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"Using auxiliary tasks in multimodal fusion of wav2vec 2.0 and BERT for multimodal emotion recognition","author":"Sun","year":"2023"},{"key":"10.1016\/j.asoc.2026.114869_bib0250","series-title":"INTERSPEECH","first-page":"1998","article-title":"Exploiting fine-tuning of self-supervised learning models for improving bi-modal sentiment analysis and emotion recognition","author":"Yang","year":"2022"},{"key":"10.1016\/j.asoc.2026.114869_bib0255","series-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"7367","article-title":"Speech emotion recognition with co-attention based multi-level acoustic information","author":"Zou","year":"2022"},{"key":"10.1016\/j.asoc.2026.114869_bib0260","series-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"6922","article-title":"Speech emotion recognition using self-supervised features","author":"Morais","year":"2022"},{"key":"10.1016\/j.asoc.2026.114869_bib0265","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"Multimodal emotion recognition based on deep temporal features using cross-modal transformer and self-attention","author":"Maji","year":"2023"},{"issue":"21","key":"10.1016\/j.asoc.2026.114869_bib0270","doi-asserted-by":"crossref","DOI":"10.3390\/app14219981","article-title":"Speech emotion recognition using transfer learning: integration of advanced speaker embeddings and image recognition models","volume":"14","author":"Jakubec","year":"2024","journal-title":"Appl. Sci."}],"container-title":["Applied Soft Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1568494626003170?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1568494626003170?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T07:19:31Z","timestamp":1777619971000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1568494626003170"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":54,"alternative-id":["S1568494626003170"],"URL":"https:\/\/doi.org\/10.1016\/j.asoc.2026.114869","relation":{},"ISSN":["1568-4946"],"issn-type":[{"value":"1568-4946","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Leveraging hierarchical attention and dynamic fusion mechanisms for multi-modal speech emotion recognition","name":"articletitle","label":"Article Title"},{"value":"Applied Soft Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.asoc.2026.114869","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114869"}}