{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T11:19:12Z","timestamp":1772104752048,"version":"3.50.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T00:00:00Z","timestamp":1772064000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T00:00:00Z","timestamp":1772064000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-026-21395-3","type":"journal-article","created":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T10:34:33Z","timestamp":1772102073000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Transformer encoder and data augmentation for real-time speech emotion recognition"],"prefix":"10.1007","volume":"85","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4741-8036","authenticated-orcid":false,"given":"Chawki","family":"Barhoumi","sequence":"first","affiliation":[]},{"given":"Yassine","family":"BenAyed","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,26]]},"reference":[{"key":"21395_CR1","doi-asserted-by":"crossref","unstructured":"Mustafa HH, Darwish NR, Hefny HA (2024) Automatic speech emotion recognition: A systematic literature review. Intl J Speech Techno 1\u201319","DOI":"10.1007\/s10772-024-10096-7"},{"key":"21395_CR2","doi-asserted-by":"publisher","first-page":"117327","DOI":"10.1109\/ACCESS.2019.2936124","volume":"7","author":"RA Khalil","year":"2019","unstructured":"Khalil RA, Jones E, Babar MI, Jan T, Zafar MH, Alhussain T (2019) Speech emotion recognition using deep learning techniques: A review. IEEE Access 7:117327\u2013117345","journal-title":"IEEE Access"},{"issue":"2","key":"21395_CR3","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1007\/s10462-024-11065-x","volume":"58","author":"C Barhoumi","year":"2024","unstructured":"Barhoumi C, BenAyed Y (2024) Real-time speech emotion recognition using deep learning and data augmentation. Artificial Intell Rev 58(2):49","journal-title":"Artificial Intell Rev"},{"key":"21395_CR4","doi-asserted-by":"crossref","unstructured":"Middya AI, Nag B, Roy S (2024) Effective MLP and CNN based ensemble learning for speech emotion recognition. Multimed Tools Appl 1\u201328","DOI":"10.1007\/s11042-024-19017-x"},{"issue":"4","key":"21395_CR5","doi-asserted-by":"publisher","first-page":"1249","DOI":"10.3390\/s21041249","volume":"21","author":"BJ Abbaschian","year":"2021","unstructured":"Abbaschian BJ, Sierra-Sosa D, Elmaghraby A (2021) Deep learning techniques for speech emotion recognition from databases to models. Sensors 21(4):1249","journal-title":"Sensors"},{"key":"21395_CR6","doi-asserted-by":"crossref","unstructured":"Oh KJ, Lee D, Ko B, Choi HJ (2017) A chatbot for psychiatric counseling in mental healthcare service based on emotional dialogue analysis and sentence generation. In: 2017 18th IEEE international conference on mobile data management (MDM), IEEE, pp 371\u2013375","DOI":"10.1109\/MDM.2017.64"},{"issue":"13","key":"21395_CR7","doi-asserted-by":"publisher","first-page":"6212","DOI":"10.3390\/s23136212","volume":"23","author":"R Ullah","year":"2023","unstructured":"Ullah R, Asif M, Shah WA, Anjam F, Ullah I, Khurshaid T, Wuttisittikulkij L, Shah S, Ali SM, Alibakhshikenari M (2023) Speech emotion recognition using convolution neural networks and multi-head convolutional transformer. Sensors 23(13):6212","journal-title":"Sensors"},{"key":"21395_CR8","doi-asserted-by":"crossref","unstructured":"Pan B, Hirota K, Jia Z, Dai Y (2023) A review of multimodal emotion recognition from datasets preprocessing features and fusion methods. Neurocomputing 126866","DOI":"10.1016\/j.neucom.2023.126866"},{"issue":"1","key":"21395_CR9","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1007\/s11760-023-02716-7","volume":"18","author":"SP Mishra","year":"2024","unstructured":"Mishra SP, Warule P, Deb S (2024) Speech emotion recognition using mfcc-based entropy feature. Signal Image Video Process 18(1):153\u2013161","journal-title":"Signal Image Video Process"},{"key":"21395_CR10","doi-asserted-by":"publisher","first-page":"1616","DOI":"10.1109\/TIFS.2019.2941773","volume":"15","author":"A Chowdhury","year":"2019","unstructured":"Chowdhury A, Ross A (2019) Fusing mfcc and lpc features using 1d triplet cnn for speaker recognition in severely degraded audio signals. IEEE Trans Inf Forensic Secur 15:1616\u20131629","journal-title":"IEEE Trans Inf Forensic Secur"},{"issue":"4","key":"21395_CR11","doi-asserted-by":"publisher","first-page":"2525","DOI":"10.1007\/s11277-023-10244-3","volume":"129","author":"MJ Al-Dujaili","year":"2023","unstructured":"Al-Dujaili MJ, Ebrahimi-Moghadam A (2023) Speech emotion recognition: A comprehensive survey. Wirel Personal Commun 129(4):2525\u20132561","journal-title":"Wirel Personal Commun"},{"key":"21395_CR12","doi-asserted-by":"publisher","unstructured":"O\u2019Shaughnessy D (2025) Review of automatic estimation of emotions in speech. Appl Sci 15(10). https:\/\/doi.org\/10.3390\/app15105731","DOI":"10.3390\/app15105731"},{"key":"21395_CR13","doi-asserted-by":"crossref","unstructured":"Zhang T, Tan Z (2024) Survey of deep emotion recognition in dynamic data using facial speech and textual cues. Multimed Tools Appl 1\u201340","DOI":"10.36227\/techrxiv.15184302.v2"},{"key":"21395_CR14","unstructured":"Vaswani A (2017) Attention is all you need. Adv Neural Inf Process Syst"},{"issue":"12","key":"21395_CR15","doi-asserted-by":"publisher","first-page":"7433","DOI":"10.1007\/s00034-023-02454-8","volume":"42","author":"S Ahmed","year":"2023","unstructured":"Ahmed S, Nielsen IE, Tripathi A, Siddiqui S, Ramachandran RP, Rasool G (2023) Transformers in time-series analysis: A tutorial. Circ Syst Signal Process 42(12):7433\u20137466","journal-title":"Circ Syst Signal Process"},{"key":"21395_CR16","doi-asserted-by":"crossref","unstructured":"Barhoumi C, BenAyed Y (2023) Improving speech emotion recognition using data augmentation and balancing techniques. In: 2023 international conference on cyberworlds (CW), IEEE, pp 282\u2013289","DOI":"10.1109\/CW58918.2023.00051"},{"issue":"12","key":"21395_CR17","doi-asserted-by":"publisher","first-page":"7357","DOI":"10.1007\/s00034-023-02446-8","volume":"42","author":"M Hama Saeed","year":"2023","unstructured":"Hama Saeed M (2023) Improved speech emotion classification using deep neural network. Circ Syst Signal Process 42(12):7357\u20137376","journal-title":"Circ Syst Signal Process"},{"issue":"3","key":"21395_CR18","doi-asserted-by":"publisher","first-page":"1565","DOI":"10.18280\/ts.410344","volume":"41","author":"AS Shinde","year":"2024","unstructured":"Shinde AS, Patil VV (2024) Effect of data augmentation cross-validation methods in robustness of explainable speech based emotion recognition. Traitement du Signal 41(3):1565","journal-title":"Traitement du Signal"},{"key":"21395_CR19","doi-asserted-by":"crossref","unstructured":"Baklouti I, Ahmed OB, Baklouti R, Fernandez C (2024) Cross-lingual transfert learning for speech emotion recognition. In: 2024 IEEE 7th international conference on advanced technologies signal and image processing (ATSIP), vol 1. IEEE, pp 559\u2013563","DOI":"10.1109\/ATSIP62566.2024.10638918"},{"issue":"8","key":"21395_CR20","doi-asserted-by":"publisher","first-page":"4750","DOI":"10.3390\/app13084750","volume":"13","author":"AS Alluhaidan","year":"2023","unstructured":"Alluhaidan AS, Saidani O, Jahangir R, Nauman MA, Neffati OS (2023) Speech emotion recognition through hybrid features and convolutional neural network. Appl Sci 13(8):4750","journal-title":"Appl Sci"},{"issue":"4","key":"21395_CR21","doi-asserted-by":"publisher","first-page":"839","DOI":"10.3390\/electronics12040839","volume":"12","author":"K Bhangale","year":"2023","unstructured":"Bhangale K, Kothandaraman M (2023) Speech emotion recognition based on multiple acoustic features and deep convolutional neural network. Electron 12(4):839","journal-title":"Electron"},{"key":"21395_CR22","doi-asserted-by":"crossref","unstructured":"Dabbabi K, Mars A (2024) Self-supervised learning for speech emotion recognition task using audio-visual features and distil hubert model on baved and ravdess databases. J Syst Sci Syst Eng 1\u201331","DOI":"10.1007\/s11518-024-5607-y"},{"key":"21395_CR23","doi-asserted-by":"crossref","unstructured":"Akinpelu S, Viriri S, Adegun A (2023) Lightweight deep learning framework for speech emotion recognition. IEEE Access","DOI":"10.1109\/ACCESS.2023.3297269"},{"issue":"1","key":"21395_CR24","doi-asserted-by":"publisher","first-page":"13126","DOI":"10.1038\/s41598-024-63776-4","volume":"14","author":"S Akinpelu","year":"2024","unstructured":"Akinpelu S, Viriri S, Adegun A (2024) An enhanced speech emotion recognition using vision transformer. Sci Rep 14(1):13126","journal-title":"Sci Rep"},{"key":"21395_CR25","doi-asserted-by":"crossref","unstructured":"Mantegazza I, Ntalampiras S (2023) Italian speech emotion recognition. In: 2023 24th international conference on digital signal processing (DSP), IEEE, pp 1\u20135","DOI":"10.1109\/DSP58604.2023.10167766"},{"key":"21395_CR26","doi-asserted-by":"crossref","unstructured":"Thiripurasundari D, Bhangale K, Aashritha V, Mondreti S, Kothandaraman M (2024) Speech emotion recognition for human\u2013computer interaction. Int J Speech Technol 1\u201314","DOI":"10.1007\/s10772-024-10138-0"},{"key":"21395_CR27","doi-asserted-by":"crossref","unstructured":"Hashem A, Arif M, Alghamdi M (2023) Speech emotion recognition approaches: A systematic review. Speech Commun 102974","DOI":"10.1016\/j.specom.2023.102974"},{"issue":"21","key":"21395_CR28","doi-asserted-by":"publisher","first-page":"9890","DOI":"10.3390\/app14219890","volume":"14","author":"KM Roh","year":"2024","unstructured":"Roh KM, Lee SP (2024) Enhanced speech emotion recognition using conditional-dcgan-based data augmentation. Appl Sci 14(21):9890","journal-title":"Appl Sci"},{"key":"21395_CR29","doi-asserted-by":"crossref","unstructured":"Sharrab YO, Attar H, Eljinini MAH, Al-Omary Y, Al-Momani W (2025) Advancements in speech recognition: A systematic review of deep learning transformer models trends innovations and future directions. IEEE Access","DOI":"10.1109\/ACCESS.2025.3550855"},{"issue":"3","key":"21395_CR30","doi-asserted-by":"publisher","first-page":"1920","DOI":"10.11591\/eei.v13i3.6049","volume":"13","author":"K Al Mukarram","year":"2024","unstructured":"Al Mukarram K, Mukhlas MA, Zahra A (2024) Enhancing speech emotion recognition with deep learning using multi-feature stacking and data augmentation. Bull Electr Eng Inf 13(3):1920\u20131926","journal-title":"Bull Electr Eng Inf"},{"key":"21395_CR31","doi-asserted-by":"publisher","first-page":"36018","DOI":"10.1109\/ACCESS.2022.3163856","volume":"10","author":"F Andayani","year":"2022","unstructured":"Andayani F, Theng LB, Tsun MT, Chua C (2022) Hybrid LSTM-transformer model for emotion recognition from speech audio files. IEEE Access 10:36018\u201336027","journal-title":"IEEE Access"},{"key":"21395_CR32","doi-asserted-by":"crossref","unstructured":"Nediyanchath A, Paramasivam P, Yenigalla P (2020) Multi-head attention for speech emotion recognition with auxiliary learning of gender recognition. In: ICASSP 2020-2020 IEEE international conference on acoustics speech and signal processing (ICASSP), IEEE, pp 7179\u20137183","DOI":"10.1109\/ICASSP40776.2020.9054073"},{"key":"21395_CR33","doi-asserted-by":"publisher","unstructured":"Waleed GT, Shaker SH (2025) Speech emotion recognition on meld and ravdess datasets using CNN. Inf 16(7). https:\/\/doi.org\/10.3390\/info16070518","DOI":"10.3390\/info16070518"},{"key":"21395_CR34","doi-asserted-by":"crossref","unstructured":"Burkhardt F, Paeschke A, Rolfes M, Sendlmeier WF, Weiss B et al (2005) A database of german emotional speech. In: Interspeech, vol 5, pp 1517\u20131520","DOI":"10.21437\/Interspeech.2005-446"},{"issue":"5","key":"21395_CR35","doi-asserted-by":"publisher","first-page":"0196391","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone SR, Russo FA (2018) The ryerson audio-visual database of emotional speech and song (ravdess): A dynamic multimodal set of facial and vocal expressions in north american english. PLoS ONE 13(5):0196391","journal-title":"PLoS ONE"},{"key":"21395_CR36","doi-asserted-by":"publisher","DOI":"10.5683\/SP2\/E8H2MF","author":"MK Pichora-Fuller","year":"2020","unstructured":"Pichora-Fuller MK, Dupuis K (2020). Toronto emotional speech set (TESS). https:\/\/doi.org\/10.5683\/SP2\/E8H2MF","journal-title":"Toronto emotional speech set (TESS)"},{"issue":"245","key":"21395_CR37","first-page":"1","volume":"21","author":"S Chen","year":"2020","unstructured":"Chen S, Dobriban E, Lee JH (2020) A group-theoretic framework for data augmentation. J Mach Learn Res 21(245):1\u201371","journal-title":"J Mach Learn Res"},{"issue":"4","key":"21395_CR38","doi-asserted-by":"publisher","first-page":"947","DOI":"10.1007\/s10772-023-10063-8","volume":"26","author":"N Barsainyan","year":"2023","unstructured":"Barsainyan N, Singh DK (2023) Optimized cross-corpus speech emotion recognition framework based on normalized 1d convolutional neural network with data augmentation and feature selection. Int J Speech Technol 26(4):947\u2013961","journal-title":"Int J Speech Technol"},{"key":"21395_CR39","doi-asserted-by":"crossref","unstructured":"Dang A, Vu TH, Wang JC et al (2023) Emix: A data augmentation method for speech emotion recognition. In: ICASSP 2023-2023 IEEE international conference on acoustics speech and signal processing (ICASSP), IEEE, pp 1\u20135","DOI":"10.1109\/ICASSP49357.2023.10096789"},{"key":"21395_CR40","doi-asserted-by":"crossref","unstructured":"Abdelhamid AA, El-Kenawy ESM, Alotaibi B, Amer GM, Abdelkader MY, Ibrahim A, Eid MM (2022) Robust speech emotion recognition using CNN+ LSTM based on stochastic fractal search optimization algorithm. IEEE Access 10:49265\u201349284","DOI":"10.1109\/ACCESS.2022.3172954"},{"key":"21395_CR41","doi-asserted-by":"crossref","unstructured":"Padi S, Sadjadi SO, Sriram RD, Manocha D (2021) Improved speech emotion recognition using transfer learning and spectrogram augmentation. In: Proceedings of the 2021 international conference on multimodal interaction, pp 645\u2013652","DOI":"10.1145\/3462244.3481003"},{"key":"21395_CR42","doi-asserted-by":"crossref","unstructured":"Ko T, Peddinti V, Povey D, Khudanpur S (2015) Audio augmentation for speech recognition. In: Interspeech, vol 2015, p 3586","DOI":"10.21437\/Interspeech.2015-711"},{"issue":"16","key":"21395_CR43","doi-asserted-by":"publisher","first-page":"5941","DOI":"10.3390\/s22165941","volume":"22","author":"BT Atmaja","year":"2022","unstructured":"Atmaja BT, Sasou A (2022) Effects of data augmentations on speech emotion recognition. Sensors 22(16):5941","journal-title":"Sensors"},{"key":"21395_CR44","doi-asserted-by":"crossref","unstructured":"McFee B, Raffel C, Liang D, Ellis DP, McVicar M, Battenberg E, Nieto O (2015) librosa: Audio and music signal analysis in python. In: SciPy, pp 18\u201324","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"21395_CR45","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2023.109578","volume":"212","author":"SP Mishra","year":"2023","unstructured":"Mishra SP, Warule P, Deb S (2023) Variational mode decomposition based acoustic and entropy features for speech emotion recognition. Appl Acoust 212:109578","journal-title":"Appl Acoust"},{"key":"21395_CR46","doi-asserted-by":"crossref","unstructured":"Nouisser A, Zouari R, Kherallah M (2022) Enhanced mobilenet and transfer learning for facial emotion recognition. In: 2022 international arab conference on information technology (ACIT), IEEE, pp 1\u20135","DOI":"10.1109\/ACIT57182.2022.9994192"},{"key":"21395_CR47","unstructured":"Gehring J, Auli M, Grangier D, Yarats D, Dauphin YN (2017) Convolutional sequence to sequence learning. In: International conference on machine learning, PMLR, pp 1243\u20131252"},{"key":"21395_CR48","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.111077","volume":"281","author":"Z Chen","year":"2023","unstructured":"Chen Z, Lin M, Wang Z, Zheng Q, Liu C (2023) Spatio-temporal representation learning enhanced speech emotion recognition with multi-head attention mechanisms. Knowl-Based Syst 281:111077","journal-title":"Knowl-Based Syst"},{"key":"21395_CR49","doi-asserted-by":"crossref","unstructured":"Mishra S, Bhatnagar N, Prakasam P, Sureshkumar TR (2024) Speech emotion recognition and classification using hybrid deep cnn and bilstm model. Multimedia Tools Appl 83(13):37603\u201337620","DOI":"10.1007\/s11042-023-16849-x"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-026-21395-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-026-21395-3","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-026-21395-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T10:34:43Z","timestamp":1772102083000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-026-21395-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,26]]},"references-count":49,"journal-issue":{"issue":"3","published-online":{"date-parts":[[2026,3]]}},"alternative-id":["21395"],"URL":"https:\/\/doi.org\/10.1007\/s11042-026-21395-3","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,26]]},"assertion":[{"value":"7 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 November 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2026","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 February 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"226"}}