{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T08:06:30Z","timestamp":1779437190849,"version":"3.53.1"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T00:00:00Z","timestamp":1763164800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T00:00:00Z","timestamp":1763164800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Natural Science Foundation of Qinghai Province of China","award":["2022-ZJ-925"],"award-info":[{"award-number":["2022-ZJ-925"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62066039"],"award-info":[{"award-number":["62066039"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1007\/s00034-025-03408-y","type":"journal-article","created":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T17:34:41Z","timestamp":1763228081000},"page":"4173-4198","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Multi-branch Interactive Attention Network Based on Self-Distillation for Speech Emotion Recognition"],"prefix":"10.1007","volume":"45","author":[{"given":"Yuanyuan","family":"Wei","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6339-7135","authenticated-orcid":false,"given":"Heming","family":"Huang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kedi","family":"Huang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yonghong","family":"Fan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jie","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"3408_CR1","doi-asserted-by":"publisher","unstructured":"A. Aftab, A. Morsali, S. Ghaemmaghami, et al., LIGHT-SERNET: a lightweight fully convolutional neural network for speech emotion recognition (ICASSP 2022), pp. 6912\u20136916. https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746679","DOI":"10.1109\/ICASSP43922.2022.9746679"},{"key":"3408_CR2","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.119633","volume":"218","author":"MR Ahmed","year":"2023","unstructured":"M.R. Ahmed, S. Islam, A.K.M.M. Islam et al., An ensemble 1D-CNN-LSTM-GRU model with data augmentation for speech emotion recognition. Expert Syst. Appl. 218, 119633 (2023). https:\/\/doi.org\/10.1016\/j.eswa.2023.119633","journal-title":"Expert Syst. Appl."},{"key":"3408_CR3","unstructured":"B. Alkin, M. Beck, K. P\u00f6ppel, et al., Vision-LSTM: xLSTM as generic vision backbone. Preprint at arXiv:2406.04303 (2024)"},{"issue":"8","key":"3408_CR4","doi-asserted-by":"publisher","first-page":"4750","DOI":"10.3390\/app13084750","volume":"13","author":"AS Alluhaidan","year":"2023","unstructured":"A.S. Alluhaidan, O. Saidani, R. Jahangir et al., Speech emotion recognition through hybrid features and convolutional neural network. Appl. Sci. 13(8), 4750 (2023). https:\/\/doi.org\/10.3390\/app13084750","journal-title":"Appl. Sci."},{"key":"3408_CR5","first-page":"107547","volume":"37","author":"M Beck","year":"2024","unstructured":"M. Beck, K. P\u00f6ppel, M. Spanring et al., XLSTM: extended long short-term memory. Adv. Neural Inf. Process. Syst. 37, 107547\u2013107603 (2024)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"3408_CR6","doi-asserted-by":"publisher","unstructured":"F. Burkhardt, A. Paeschke, M. Rolfes, et al., A database of German emotional speech. In: Interspeech, vol. 5 (2005). pp. 1517\u20131520. https:\/\/doi.org\/10.21437\/Interspeech.2005-446","DOI":"10.21437\/Interspeech.2005-446"},{"key":"3408_CR7","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"C. Busso, M. Bulut, C.C. Lee et al., IEMOCAP: interactive emotional dyadic motion capture database. Lang. Resour. Eval. 42, 335\u2013359 (2008). https:\/\/doi.org\/10.1007\/s10579-008-9076-6","journal-title":"Lang. Resour. Eval."},{"issue":"20","key":"3408_CR8","first-page":"6","volume":"1","author":"Y Chavhan","year":"2010","unstructured":"Y. Chavhan, M.L. Dhore, P. Yesaware et al., Speech emotion recognition using support vector machine. Int. J. Comput. Appl. 1(20), 6\u20139 (2010)","journal-title":"Int. J. Comput. Appl."},{"key":"3408_CR9","doi-asserted-by":"crossref","unstructured":"T. Chen, C. Ding, L. Zhu, et al., xLSTM-UNet can be an effective 2D & 3D medical image segmentation backbone with Vision-LSTM (ViL) better than its Mamba counterpart. Preprint at arXiv:2407.01530 (2024)","DOI":"10.1109\/BHI62660.2024.10913659"},{"issue":"3","key":"3408_CR10","doi-asserted-by":"publisher","first-page":"1711","DOI":"10.1109\/TAFFC.2024.3369726","volume":"15","author":"W Chen","year":"2024","unstructured":"W. Chen, X. Xing, P. Chen et al., Vesper: a compact and effective pretrained model for speech emotion recognition. IEEE Trans. Affect. Comput. 15(3), 1711\u20131724 (2024). https:\/\/doi.org\/10.1109\/TAFFC.2024.3369726","journal-title":"IEEE Trans. Affect. Comput."},{"key":"3408_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.118943","volume":"214","author":"Z Chen","year":"2023","unstructured":"Z. Chen, J. Li, H. Liu et al., Learning multi-scale features for speech emotion recognition with connection attention mechanism. Expert Syst. Appl. 214, 118943 (2023). https:\/\/doi.org\/10.1016\/j.eswa.2022.118943","journal-title":"Expert Syst. Appl."},{"key":"3408_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.111077","volume":"281","author":"Z Chen","year":"2023","unstructured":"Z. Chen, M. Lin, Z. Wang et al., Spatio-temporal representation learning enhanced speech emotion recognition with multi-head attention mechanisms. Knowl.-Based Syst. 281, 111077 (2023). https:\/\/doi.org\/10.1016\/j.knosys.2023.111077","journal-title":"Knowl.-Based Syst."},{"issue":"1","key":"3408_CR13","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1109\/79.911197","volume":"18","author":"R Cowie","year":"2001","unstructured":"R. Cowie, E. Douglas-Cowie, N. Tsapatsoulis et al., Emotion recognition in human-computer interaction. IEEE Signal Proc. Mag. 18(1), 32\u201380 (2001). https:\/\/doi.org\/10.1109\/79.911197","journal-title":"IEEE Signal Proc. Mag."},{"key":"3408_CR14","doi-asserted-by":"publisher","first-page":"9910","DOI":"10.1109\/TCSVT.2024.3405406","volume":"34","author":"Y Dai","year":"2024","unstructured":"Y. Dai, Y. Li, D. Chen et al., Multimodal decoupled distillation graph neural network for emotion recognition in conversation. IEEE Trans. Circuits Syst. Video Technol. 34, 9910\u20139924 (2024). https:\/\/doi.org\/10.1109\/TCSVT.2024.3405406","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3408_CR15","unstructured":"A. Dosovitskiy, L. Beyer, A. Kolesnikov, et al., An image is worth 16x16 words: transformers for image recognition at scale. Preprint at arXiv:2010.11929 (2020)"},{"key":"3408_CR16","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128879","volume":"615","author":"Y Fan","year":"2025","unstructured":"Y. Fan, H. Huang, H. Han et al., Hierarchical convolutional neural networks with post-attention for speech emotion recognition. Neurocomputing 615, 128879 (2025). https:\/\/doi.org\/10.1016\/j.neucom.2024.128879","journal-title":"Neurocomputing"},{"issue":"6","key":"3408_CR17","doi-asserted-by":"publisher","first-page":"1789","DOI":"10.1007\/s11263-021-01453-z","volume":"129","author":"J Gou","year":"2021","unstructured":"J. Gou, B. Yu, S.J. Maybank et al., Knowledge distillation: a survey. Int. J. Comput. Vis. 129(6), 1789\u20131819 (2021). https:\/\/doi.org\/10.1007\/s11263-021-01453-z","journal-title":"Int. J. Comput. Vis."},{"issue":"1","key":"3408_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TNNLS.2023.3304516","volume":"36","author":"L Guo","year":"2023","unstructured":"L. Guo, S. Ding, L. Wang et al., DSTCNet: deep spectro-temporal-channel attention network for speech emotion recognition. IEEE Trans. Neural Netw. Learn. Syst. 36(1), 1\u201310 (2023). https:\/\/doi.org\/10.1109\/TNNLS.2023.3304516","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"3408_CR19","doi-asserted-by":"publisher","first-page":"118","DOI":"10.1016\/j.specom.2021.11.005","volume":"136","author":"L Guo","year":"2022","unstructured":"L. Guo, L. Wang, J. Dang et al., Learning affective representations based on magnitude and dynamic relative phase information for speech emotion recognition. Speech Commun. 136, 118\u2013127 (2022). https:\/\/doi.org\/10.1016\/j.specom.2021.11.005","journal-title":"Speech Commun."},{"key":"3408_CR20","first-page":"15908","volume":"34","author":"K Han","year":"2021","unstructured":"K. Han, A. Xiao, E. Wu et al., Transformer in transformer. Adv. Neural. Inf. Process. Syst. 34, 15908\u201315919 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3408_CR21","doi-asserted-by":"publisher","unstructured":"Y. He, N. Minematsu, D. Saito, et al., Multiple acoustic features speech emotion recognition using cross-attention transformer. ICASSP, 2023), pp. 1\u20135. https:\/\/doi.org\/10.1016\/j.eswa.2022.118943","DOI":"10.1016\/j.eswa.2022.118943"},{"issue":"6","key":"3408_CR22","doi-asserted-by":"publisher","first-page":"2225","DOI":"10.1007\/s10115-019-01419-1","volume":"62","author":"YH Hsieh","year":"2020","unstructured":"Y.H. Hsieh, S.C. Chen et al., A decision support system for service recovery in affective computing: an experimental investigation. Knowl. Inf. Syst. 62(6), 2225\u20132256 (2020). https:\/\/doi.org\/10.1007\/s10115-019-01419-1","journal-title":"Knowl. Inf. Syst."},{"key":"3408_CR23","doi-asserted-by":"publisher","unstructured":"Y. Hu, S. Hou, H., Yang, et al., A joint network based on interactive attention for speech emotion recognition, in IEEE International Conference on Multimedia and Expo (ICME) (2023), pp. 1715\u20131720. https:\/\/doi.org\/10.1109\/ICME55011.2023.00295","DOI":"10.1109\/ICME55011.2023.00295"},{"key":"3408_CR24","doi-asserted-by":"publisher","unstructured":"D.N. Jiang, L.H. Cai et al., Speech emotion classification with the combination of statistic features and temporal features, in IEEE Int. Conf. Multimedia Expo (ICME) vol. 3 (2004), pp. 1967\u20131970. https:\/\/doi.org\/10.1109\/ICME.2004.1394647","DOI":"10.1109\/ICME.2004.1394647"},{"key":"3408_CR25","doi-asserted-by":"publisher","unstructured":"X. Jiao, L. Wang, Y. Yu, MFHCA: enhancing speech emotion recognition via multi-spatial fusion and hierarchical cooperative attention, in IEEE International Conference on Multimedia and Expo (ICME) (2024), pp. 1\u20135. https:\/\/doi.org\/10.1109\/ICME57554.2024.10688053","DOI":"10.1109\/ICME57554.2024.10688053"},{"key":"3408_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.122946","volume":"245","author":"M Khan","year":"2024","unstructured":"M. Khan, W. Gueaieb, A. El Saddik et al., MSER: multimodal speech emotion recognition using cross-attention with deep fusion. Expert Syst. Appl. 245, 122946 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2023.122946","journal-title":"Expert Syst. Appl."},{"key":"3408_CR27","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.112123","volume":"299","author":"H Li","year":"2024","unstructured":"H. Li, X. Zhang, S. Duan et al., Speech emotion recognition based on bi-directional acoustic-articulatory conversion. Knowl.-Based Syst. 299, 112123 (2024). https:\/\/doi.org\/10.1016\/j.knosys.2024.112123","journal-title":"Knowl.-Based Syst."},{"key":"3408_CR28","doi-asserted-by":"publisher","unstructured":"M. Li, Y. Zheng, D. Li, et al. MS-SENet: enhancing speech emotion recognition through multi-scale feature fusion with squeeze-and-excitation blocks, in ICASSP (2024), pp. 12271\u201312275. https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10447232","DOI":"10.1109\/ICASSP48485.2024.10447232"},{"key":"3408_CR29","doi-asserted-by":"publisher","unstructured":"C.L. Liu, F. Yin, D.H. Wang, et al., CASIA online and offline Chinese handwriting databases, in International Conference on Document Analysis and Recognition (2011), pp. 37\u201341. https:\/\/doi.org\/10.1109\/ICDAR.2011.17","DOI":"10.1109\/ICDAR.2011.17"},{"key":"3408_CR30","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2025.128711","volume":"612","author":"T Liu","year":"2025","unstructured":"T. Liu, M. Wang, B. Yang et al., ESERNet: learning spectrogram structure relationship for effective speech emotion recognition with Swin Transformer in classroom discourse analysis. Neurocomputing 612, 128711 (2025). https:\/\/doi.org\/10.1016\/j.neucom.2025.128711","journal-title":"Neurocomputing"},{"key":"3408_CR31","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2024.109219","volume":"137","author":"Y Liu","year":"2024","unstructured":"Y. Liu, X. Chen, Y. Song et al., Discriminative feature learning based on multi-view attention network with diffusion joint loss for speech emotion recognition. Eng. Appl. Artif. Intell. 137, 109219 (2024). https:\/\/doi.org\/10.1016\/j.engappai.2024.109219","journal-title":"Eng. Appl. Artif. Intell."},{"key":"3408_CR32","doi-asserted-by":"publisher","first-page":"1063","DOI":"10.1109\/TASLP.2023.3245401","volume":"31","author":"Y Liu","year":"2023","unstructured":"Y. Liu, H. Sun, W. Guan et al., A discriminative feature representation method based on cascaded attention network with adversarial strategy for speech emotion recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 31, 1063\u20131074 (2023). https:\/\/doi.org\/10.1109\/TASLP.2023.3245401","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"3408_CR33","doi-asserted-by":"publisher","first-page":"2193","DOI":"10.1109\/TASLP.2023.3282092","volume":"31","author":"Z Liu","year":"2023","unstructured":"Z. Liu, X. Kang, F. Ren et al., Dual-TBNet: improving the robustness of speech features via dual-Transformer-BiLSTM for speech emotion recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 31, 2193\u20132203 (2023). https:\/\/doi.org\/10.1109\/TASLP.2023.3282092","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"3408_CR34","doi-asserted-by":"crossref","unstructured":"Z., Liu, Y., Lin, Y., Cao, et al., Swin transformer: hierarchical vision transformer using shifted windows, in Proceedings of the IEEE International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"issue":"8","key":"3408_CR35","doi-asserted-by":"publisher","first-page":"2297","DOI":"10.3390\/s20082297","volume":"20","author":"ZT Liu","year":"2020","unstructured":"Z.T. Liu, B.H. Wu, D.Y. Li et al., Speech emotion recognition based on selective interpolation synthetic minority over-sampling technique in small sample environment. Sensors 20(8), 2297 (2020). https:\/\/doi.org\/10.3390\/s20082297","journal-title":"Sensors"},{"key":"3408_CR36","doi-asserted-by":"crossref","unstructured":"C. Lu, H. Lian, W. Zheng, et al. Learning local to global feature aggregation for speech emotion recognition. Preprint at arXiv:2306.01491 (2023)","DOI":"10.21437\/Interspeech.2023-543"},{"issue":"6","key":"3408_CR37","doi-asserted-by":"publisher","first-page":"3159","DOI":"10.1109\/TCSS.2022.3219825","volume":"10","author":"C Lu","year":"2022","unstructured":"C. Lu, W. Zheng, H. Lian et al., Speech emotion recognition via an attentive time-frequency neural network. IEEE Trans. Comput. Soc. Syst. 10(6), 3159\u20133168 (2022). https:\/\/doi.org\/10.1109\/TCSS.2022.3219825","journal-title":"IEEE Trans. Comput. Soc. Syst."},{"key":"3408_CR38","doi-asserted-by":"publisher","first-page":"776","DOI":"10.1109\/TMM.2023.3271019","volume":"26","author":"H Ma","year":"2023","unstructured":"H. Ma, J. Wang, H. Lin et al., A Transformer-based model with self-distillation for multimodal emotion recognition in conversations. IEEE Trans. Multimed. 26, 776\u2013788 (2023). https:\/\/doi.org\/10.1109\/TMM.2023.3271019","journal-title":"IEEE Trans. Multimed."},{"key":"3408_CR39","doi-asserted-by":"crossref","unstructured":"J. Mai, X. Xing, W. Chen, et al., DropFormer: a dynamic noise-dropping transformer for speech emotion recognition, in Proc. Interspeech, pp. 2645\u20132649 (2024)","DOI":"10.21437\/Interspeech.2024-651"},{"key":"3408_CR40","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2025.3537991","author":"A Nfissi","year":"2025","unstructured":"A. Nfissi, W. Bouachir, N. Bouguila et al., SigWavNet: learning multiresolution signal wavelet network for speech emotion recognition. IEEE Trans. Affect. Comput. (2025). https:\/\/doi.org\/10.1109\/TAFFC.2025.3537991","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"4","key":"3408_CR41","doi-asserted-by":"publisher","first-page":"603","DOI":"10.1016\/S0167-6393(03)00099-2","volume":"41","author":"TL Nwe","year":"2003","unstructured":"T.L. Nwe, S.W. Foo, L.C. De Silva et al., Speech emotion recognition using hidden Markov models. Speech Commun. 41(4), 603\u2013623 (2003). https:\/\/doi.org\/10.1016\/S0167-6393(03)00099-2","journal-title":"Speech Commun."},{"key":"3408_CR42","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1016\/j.jad.2018.02.026","volume":"234","author":"M Paris","year":"2018","unstructured":"M. Paris, Y. Mahajan, J. Kim et al., Emotional speech processing deficits in bipolar disorder: the role of mismatch negativity and P3a. J. Affect. Disord. 234, 261\u2013269 (2018). https:\/\/doi.org\/10.1016\/j.jad.2018.02.026","journal-title":"J. Affect. Disord."},{"key":"3408_CR43","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2024.106967","volume":"100","author":"SS Poorna","year":"2025","unstructured":"S.S. Poorna, V. Menon, S. Gopalan, Hybrid CNN-BiLSTM architecture with multiple attention mechanisms to enhance speech emotion recognition. Biomed. Signal Process. Control 100, 106967 (2025). https:\/\/doi.org\/10.1016\/j.bspc.2024.106967","journal-title":"Biomed. Signal Process. Control"},{"issue":"2","key":"3408_CR44","doi-asserted-by":"publisher","first-page":"226","DOI":"10.1109\/TCE.2023.3236972","volume":"69","author":"GA Prabhakar","year":"2023","unstructured":"G.A. Prabhakar, B. Basel, A. Dutta et al., Multichannel CNN-BLSTM architecture for speech emotion recognition system by fusion of magnitude and phase spectral features using DCCA for consumer applications. IEEE Trans. Consum. Electron. 69(2), 226\u2013235 (2023). https:\/\/doi.org\/10.1109\/TCE.2023.3236972","journal-title":"IEEE Trans. Consum. Electron."},{"issue":"1","key":"3408_CR45","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1080\/23307706.2022.2085198","volume":"10","author":"PR Prakash","year":"2023","unstructured":"P.R. Prakash, D. Anuradha, J. Iqbal et al., A novel convolutional neural network with gated recurrent unit for automated speech emotion recognition and classification. J. Control Decis. 10(1), 54\u201363 (2023). https:\/\/doi.org\/10.1080\/23307706.2022.2085198","journal-title":"J. Control Decis."},{"key":"3408_CR46","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2024.111735","volume":"161","author":"N Saleem","year":"2024","unstructured":"N. Saleem, H. Elmannai, S. Bourouis et al., Squeeze-and-excitation 3D convolutional attention recurrent network for end-to-end speech emotion recognition. Appl. Soft Comput. 161, 111735 (2024). https:\/\/doi.org\/10.1016\/j.asoc.2024.111735","journal-title":"Appl. Soft Comput."},{"key":"3408_CR47","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1007\/s10803-018-3681-z","volume":"49","author":"S Schelinski","year":"2019","unstructured":"S. Schelinski, K. von Kriegstein et al., The relation between vocal pitch and vocal emotion recognition abilities in people with autism spectrum disorder and typical development. J. Autism Dev. Disord. 49, 68\u201382 (2019). https:\/\/doi.org\/10.1007\/s10803-018-3681-z","journal-title":"J. Autism Dev. Disord."},{"key":"3408_CR48","doi-asserted-by":"publisher","unstructured":"D. Shome, A. Etemad, et al. Speech emotion recognition with distilled prosodic and linguistic affect representations, in ICASSP (2024), pp. 11976\u201311980. https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10448505","DOI":"10.1109\/ICASSP48485.2024.10448505"},{"key":"3408_CR49","doi-asserted-by":"publisher","unstructured":"Y. Wang, C. Lu, H. Lian, et al., Speech swin-transformer: exploring a hierarchical transformer with shifted windows for speech emotion recognition, in ICASSP (2024), pp. 11646\u201311650. https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10447726","DOI":"10.1109\/ICASSP48485.2024.10447726"},{"key":"3408_CR50","doi-asserted-by":"crossref","unstructured":"Y. Wang, C. Lu, Y. Zong, et al., Time-frequency transformer: a novel time frequency joint learning method for speech emotion recognition, in Int. Conf. Neural Inf. Process., (Springer, 2023), pp. 415\u2013427","DOI":"10.1007\/978-981-99-8138-0_33"},{"key":"3408_CR51","doi-asserted-by":"publisher","unstructured":"Z. Wan, Z. Qiu, Y. Liu, et al. Metadata-enhanced speech emotion recognition: augmented residual integration and co-attention in two-stage fine-tuning, in ICASSP, 2025, pp. 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10890812","DOI":"10.1109\/ICASSP49660.2025.10890812"},{"key":"3408_CR52","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2022.109648","volume":"130","author":"X Xu","year":"2022","unstructured":"X. Xu, D. Li, Y. Zhou et al., Multi-type features separating fusion learning for speech emotion recognition. Appl. Soft Comput. 130, 109648 (2022). https:\/\/doi.org\/10.1016\/j.asoc.2022.109648","journal-title":"Appl. Soft Comput."},{"key":"3408_CR53","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1016\/j.specom.2022.07.005","volume":"145","author":"JX Ye","year":"2022","unstructured":"J.X. Ye, X.C. Wen, X.Z. Wang et al., GM-TCNet: gated multi-scale temporal convolutional network using emotion causality for speech emotion recognition. Speech Commun. 145, 21\u201335 (2022). https:\/\/doi.org\/10.1016\/j.specom.2022.07.005","journal-title":"Speech Commun."},{"issue":"1","key":"3408_CR54","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1109\/TNNLS.2020.3027600","volume":"33","author":"L Yi","year":"2020","unstructured":"L. Yi, M.W. Mak, Improving speech emotion recognition with adversarial data augmentation network. IEEE Trans. Neural Netw. Learn. Syst. 33(1), 172\u2013184 (2020). https:\/\/doi.org\/10.1109\/TNNLS.2020.3027600","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"3408_CR55","doi-asserted-by":"publisher","unstructured":"Z. Yuan, C.L.P. Chen, S. Li, et al., Disentanglement network: disentangle the emotional features from acoustic features for speech emotion recognition, in ICASSP (2024), pp. 11686\u201311690. https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10448044","DOI":"10.1109\/ICASSP48485.2024.10448044"},{"issue":"11","key":"3408_CR56","doi-asserted-by":"publisher","first-page":"2191","DOI":"10.3390\/electronics13112191","volume":"13","author":"S Yu","year":"2024","unstructured":"S. Yu, J. Meng, W. Fan et al., Speech emotion recognition using dual-stream representation and cross-attention fusion. Electronics 13(11), 2191 (2024). https:\/\/doi.org\/10.3390\/electronics13112191","journal-title":"Electronics"},{"issue":"21","key":"3408_CR57","doi-asserted-by":"publisher","first-page":"9897","DOI":"10.3390\/app11219897","volume":"11","author":"H Zhang","year":"2021","unstructured":"H. Zhang, H. Huang, H. Han et al., A novel heterogeneous parallel convolution Bi-LSTM for speech emotion recognition. Appl. Sci. 11(21), 9897 (2021). https:\/\/doi.org\/10.3390\/app11219897","journal-title":"Appl. Sci."},{"key":"3408_CR58","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2025.110060","volume":"144","author":"H Zhang","year":"2025","unstructured":"H. Zhang, H. Huang, P. Zhao et al., Sparse temporal aware capsule network for robust speech emotion recognition. Eng. Appl. Artif. Intell. 144, 110060 (2025). https:\/\/doi.org\/10.1016\/j.engappai.2025.110060","journal-title":"Eng. Appl. Artif. Intell."},{"key":"3408_CR59","doi-asserted-by":"publisher","unstructured":"J. Zhao, F. Wang, K. Li, et al., Temporal-frequency state space duality: an efficient paradigm for speech emotion recognition, in ICASSP (2025), pp. 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10890265","DOI":"10.1109\/ICASSP49660.2025.10890265"},{"key":"3408_CR60","doi-asserted-by":"crossref","unstructured":"Y. Zhong, Y. Hu, H. Huang, et al., A lightweight model based on separable convolution for speech emotion recognition, in Interspeech vol. 11 (2020), pp. 3331\u20133335. http:\/\/dx.doi.org\/10.21437\/Interspeech.2020-2408","DOI":"10.21437\/Interspeech.2020-2408"},{"key":"3408_CR61","doi-asserted-by":"publisher","unstructured":"H. Zou, Y. Si, C. Chen, et al. Speech emotion recognition with co-attention based multi-level acoustic information, in ICASSP (2022), pp. 7367\u20137371. https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747095","DOI":"10.1109\/ICASSP43922.2022.9747095"}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-025-03408-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00034-025-03408-y","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-025-03408-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T07:31:49Z","timestamp":1779435109000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00034-025-03408-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":61,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,5]]}},"alternative-id":["3408"],"URL":"https:\/\/doi.org\/10.1007\/s00034-025-03408-y","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"value":"0278-081X","type":"print"},{"value":"1531-5878","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"6 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 October 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 October 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 November 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"All authors of this study agree to publication.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}]}}