{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T20:21:16Z","timestamp":1768335676695,"version":"3.49.0"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T00:00:00Z","timestamp":1767657600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T00:00:00Z","timestamp":1767657600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Cluster Comput"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s10586-025-05883-z","type":"journal-article","created":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T15:32:55Z","timestamp":1767713575000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Wav2TP: A novel speech emotion recognition model using temporal pooling over transformer-based Wav2Vec2 embeddings"],"prefix":"10.1007","volume":"29","author":[{"given":"Yunus","family":"Korkmaz","sequence":"first","affiliation":[]},{"given":"Yaser","family":"Jararweh","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,6]]},"reference":[{"issue":"6","key":"5883_CR1","doi-asserted-by":"publisher","first-page":"676","DOI":"10.1002\/wcs.147","volume":"2","author":"JR Zadra","year":"2011","unstructured":"Zadra, J.R., Clore, G.L.: Emotion and Perception: The Role of Affective Information. Wiley Interdiscip Rev Cogn Sci. 2(6), 676\u2013685 (2011)","journal-title":"Wiley Interdiscip Rev Cogn Sci."},{"key":"5883_CR2","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102019","volume":"102","author":"SK Khare","year":"2024","unstructured":"Khare, S.K., et al.: Emotion recognition and artificial intelligence: A systematic review (2014\u20132023) and research recommendations. Information Fusion 102, 102019 (2024)","journal-title":"Information Fusion"},{"key":"5883_CR3","volume-title":"Emotion Recognition using Speech Features","author":"SR Krothapalli","year":"2013","unstructured":"Krothapalli, S.R., Koolagudi, S.G.: Speech Emotion Recognition: A Review. In: Emotion Recognition using Speech Features. Springer, New York, NY (2013)"},{"key":"5883_CR4","doi-asserted-by":"crossref","unstructured":"Alhussein, G., et al.: Speech emotion recognition in conversations using artificial intelligence: A systematic review and meta-analysis. Artif. Intell. Rev., 58, 198 (2025)","DOI":"10.1007\/s10462-025-11197-8"},{"key":"5883_CR5","doi-asserted-by":"publisher","DOI":"10.3390\/app13084750","author":"AS Alluhaidan","year":"2023","unstructured":"Alluhaidan, A.S., et al.: Speech Emotion Recognition through Hybrid Features and Convolutional Neural Network. Applied Sciences (2023). https:\/\/doi.org\/10.3390\/app13084750","journal-title":"Applied Sciences"},{"key":"5883_CR6","doi-asserted-by":"publisher","DOI":"10.3390\/electronics10101163","author":"E Lieskovsk\u00e1","year":"2021","unstructured":"Lieskovsk\u00e1, E., et al.: A Review on Speech Emotion Recognition Using Deep Learning and Attention Mechanism. Electronics (2021). https:\/\/doi.org\/10.3390\/electronics10101163","journal-title":"Electronics"},{"key":"5883_CR7","doi-asserted-by":"publisher","DOI":"10.3390\/a13030070","author":"K Zvarevashe","year":"2020","unstructured":"Zvarevashe, K., Olugbara, O.: Ensemble Learning of Hybrid Acoustic Features for Speech Emotion Recognition. Algorithms (2020). https:\/\/doi.org\/10.3390\/a13030070","journal-title":"Algorithms"},{"key":"5883_CR8","doi-asserted-by":"publisher","DOI":"10.1017\/ATSIP.2021.7","volume":"10","author":"A Ando","year":"2021","unstructured":"Ando, A., et al.: Speech emotion recognition based on listener-dependent emotion perception models. APSIPA Transactions on Signal and Information Processing 10, e6 (2021)","journal-title":"APSIPA Transactions on Signal and Information Processing"},{"key":"5883_CR9","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Advancements and challenges in speech emotion recognition: a comprehensive review, Proc. SPIE 13077, Fourth International Conference on Signal Processing and Machine Learning (CONF-SPML), (2024)","DOI":"10.1117\/12.3027122"},{"key":"5883_CR10","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural. Inf. Process. Syst. 30, 6000\u20136010 (2017)"},{"key":"5883_CR11","unstructured":"Wolf, T., et al.: Transformers: State-of-the-Art Natural Language Processing, Neurocomputing, In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 38\u201345, (2020)"},{"key":"5883_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.122666","volume":"241","author":"S Islam","year":"2024","unstructured":"Islam, S., et al.: A comprehensive survey on applications of transformers for deep learning tasks. Expert Systems with Applications 241, 122666 (2024)","journal-title":"Expert Systems with Applications"},{"key":"5883_CR13","unstructured":"Baevski, A., et al.: Wav2vec 2.0: a framework for self-supervised learning of speech representations, In Proceedings of the 34th International Conference on Neural Information Processing Systems (NIPS \u201820), pp. 12449\u201312460, (2020)"},{"key":"5883_CR14","doi-asserted-by":"publisher","first-page":"96","DOI":"10.1007\/s42452-024-05731-6","volume":"6","author":"H Jiang","year":"2024","unstructured":"Jiang, H., et al.: Scanning dial: the instantaneous audio classification transformer. Discover Appl. Sci. 6, 96 (2024)","journal-title":"Discover Appl. Sci."},{"key":"5883_CR15","doi-asserted-by":"crossref","unstructured":"Zhu, W., Omar, M.: Multiscale audio spectrogram transformer for efficient audio classification (2023). arXiv:2303.10757","DOI":"10.1109\/ICASSP49357.2023.10096513"},{"key":"5883_CR16","unstructured":"Kim, S., et al.: Squeezeformer: An efficient transformer for automatic speech recognition (2022). arXiv:2206.00888"},{"key":"5883_CR17","doi-asserted-by":"publisher","DOI":"10.3390\/rs16132442","author":"Z Pu","year":"2024","unstructured":"Pu, Z., et al.: A Novel Multi-Feature Fusion Model Based on Pre-Trained Wav2vec 2.0 for Underwater Acoustic Target Recognition. Remote Sensing (2024). https:\/\/doi.org\/10.3390\/rs16132442","journal-title":"Remote Sensing"},{"key":"5883_CR18","doi-asserted-by":"publisher","first-page":"340","DOI":"10.1007\/s11263-018-1111-5","volume":"127","author":"A Cherian","year":"2019","unstructured":"Cherian, A., Gould, S.: Second-order Temporal pooling for action recognition. Int. J. Comput. Vision. 127, 340\u2013362 (2019)","journal-title":"Int. J. Comput. Vision"},{"issue":"3","key":"5883_CR19","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/0098-3004(93)90090-R","volume":"19","author":"A Ma\u0107kiewicz","year":"1993","unstructured":"Ma\u0107kiewicz, A., Ratajczak, W.: Principal components analysis (PCA). Comput. Geosci. 19(3), 303\u2013342 (1993)","journal-title":"Comput. Geosci."},{"key":"5883_CR20","doi-asserted-by":"crossref","unstructured":"Ye, J., et al.: Temporal modeling matters: A novel Temporal emotional modeling approach for speech emotion recognition (2022). arXiv:2211.08233","DOI":"10.1109\/ICASSP49357.2023.10096370"},{"key":"5883_CR21","doi-asserted-by":"crossref","unstructured":"Zou, H., et al.: Speech emotion recognition with Co-Attention based Multi-Level acoustic information (2022). arXiv:2203.15326","DOI":"10.1109\/ICASSP43922.2022.9747095"},{"key":"5883_CR22","doi-asserted-by":"publisher","DOI":"10.3390\/s23136212","author":"R Ullah","year":"2023","unstructured":"Ullah, R., et al.: Speech Emotion Recognition Using Convolution Neural Networks and Multi-Head Convolutional Transformer. Sensors (2023). https:\/\/doi.org\/10.3390\/s23136212","journal-title":"Sensors"},{"key":"5883_CR23","unstructured":"Jafarzadeh, P., et al.: Speaker emotion recognition: Leveraging Self-Supervised models for feature extraction using Wav2Vec2 and HuBERT (2024). arXiv:2411.02964"},{"key":"5883_CR24","doi-asserted-by":"publisher","first-page":"11265","DOI":"10.1007\/s11042-022-13463-1","volume":"82","author":"K Chauhan","year":"2023","unstructured":"Chauhan, K., et al.: A method for simplifying the spoken emotion recognition system using a shallow neural network and temporal feature stacking & pooling (TFSP). Multimedia Tools and Applications 82, 11265\u201311283 (2023)","journal-title":"Multimedia Tools and Applications"},{"issue":"2","key":"5883_CR25","doi-asserted-by":"publisher","first-page":"401","DOI":"10.1049\/cit2.12233","volume":"8","author":"N Saleem","year":"2023","unstructured":"Saleem, N., et al.: DeepCNN: Spectro-temporal feature representation for speech emotion recognition. CAAI Transactions on Intelligence Technology 8(2), 401\u2013417 (2023)","journal-title":"CAAI Transactions on Intelligence Technology"},{"key":"5883_CR26","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1007\/s10462-024-10869-1","volume":"57","author":"NC Ristea","year":"2024","unstructured":"Ristea, N.C., et al.: Cascaded cross-modal transformer for audio\u2013textual classification. Artif. Intell. Rev. 57, 225 (2024)","journal-title":"Artif. Intell. Rev."},{"key":"5883_CR27","doi-asserted-by":"publisher","first-page":"23667","DOI":"10.1007\/s11227-024-06158-x","volume":"80","author":"B Nasersharif","year":"2024","unstructured":"Nasersharif, B., Namvarpour, M.: Exploring the potential of Wav2vec 2.0 for speech emotion recognition using classifier combination and attention-based feature fusion. J. Supercomputing. 80, 23667\u201323688 (2024)","journal-title":"J. Supercomputing"},{"key":"5883_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvoice.2024.09.002","author":"J Cai","year":"2024","unstructured":"Cai, J., et al.: Voice disorder classification using Wav2vec 2.0 feature extraction. J. Voice. Article in Press(2024). https:\/\/doi.org\/10.1016\/j.jvoice.2024.09.002","journal-title":"J. Voice"},{"key":"5883_CR29","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.110814","volume":"277","author":"N Naderi","year":"2023","unstructured":"Naderi, N., Nasersharif, B.: Cross Corpus Speech Emotion Recognition using transfer learning and attention-based fusion of Wav2Vec2 and prosody features. Knowledge-Based Systems 277, 110814 (2023)","journal-title":"Knowledge-Based Systems"},{"key":"5883_CR30","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2023.101550","volume":"83","author":"SR Kadiri","year":"2024","unstructured":"Kadiri, S.R., Javanmardi, F., Alku, P.: Investigation of self-supervised pre-trained models for classification of voice quality from speech and neck surface accelerometer signals. Computer Speech & Language 83, 101550 (2024)","journal-title":"Computer Speech & Language"},{"key":"5883_CR31","doi-asserted-by":"publisher","first-page":"633","DOI":"10.1007\/s44230-024-00088-w","volume":"4","author":"E Lesyk","year":"2024","unstructured":"Lesyk, E., et al.: Empathetic Deep Learning: Transferring Adult Speech Emotion Models to Children with Gender-Specific Adaptations Using Neural Embeddings. Human-Centric Intelligent Systems 4, 633\u2013642 (2024)","journal-title":"Human-Centric Intelligent Systems"},{"key":"5883_CR32","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2023.103010","volume":"156","author":"M Liu","year":"2024","unstructured":"Liu, M., et al.: Multiscale-multichannel feature extraction and classification through one-dimensional convolutional neural network for Speech emotion recognition. Speech Communication 156, 103010 (2024)","journal-title":"Speech Communication"},{"issue":"1","key":"5883_CR33","doi-asserted-by":"publisher","first-page":"1009","DOI":"10.32604\/cmc.2023.041332","volume":"77","author":"S Park","year":"2023","unstructured":"Park, S., et al.: Using Speaker-Specific Emotion Representations in Wav2vec 2.0-Based Modules for Speech Emotion Recognition. Computers, Materials and Continua 77(1), 1009\u20131030 (2023)","journal-title":"Computers, Materials and Continua"},{"key":"5883_CR34","doi-asserted-by":"publisher","DOI":"10.1016\/j.dcan.2024.10.007","author":"X Li","year":"2024","unstructured":"Li, X., Zhang, Z.: Cross-feature fusion speech emotion recognition based on attention mask residual network and Wav2vec 2.0. Digit. Commun. Networks. Article in Press(2024). https:\/\/doi.org\/10.1016\/j.dcan.2024.10.007","journal-title":"Digit. Commun. Networks"},{"key":"5883_CR35","doi-asserted-by":"crossref","unstructured":"Burkhardt, F., et al.: A Database of German Emotional Speech, Proceedings of Interspeech, Lisbon, Portugal, (2005)","DOI":"10.21437\/Interspeech.2005-446"},{"key":"5883_CR36","first-page":"pp120","volume":"23","author":"A Zulfiqar","year":"2019","unstructured":"Zulfiqar, A., et al.: Descriptive statistics: Measures of central Tendency, Dispersion, correlation and regression. Airway. 23, pp120\u2013125 (2019)","journal-title":"Airway"},{"key":"5883_CR37","doi-asserted-by":"publisher","first-page":"533","DOI":"10.1038\/323533a0","volume":"323","author":"DE Rumelhart","year":"1986","unstructured":"Rumelhart, D.E., et al.: Learning representations by back-propagating errors. Nature. 323, 533\u2013536 (1986)","journal-title":"Nature"},{"key":"5883_CR38","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization (2014). arXiv:1412.6980"},{"key":"5883_CR39","unstructured":"Nair, V., Hinton, G.E.: Rectified Linear Units Improve Restricted Boltzmann Machines, In Proceedings of the 27th International Conference on Machine Learning (ICML-10), pp. 807\u2013814, (2010)"},{"key":"5883_CR40","doi-asserted-by":"publisher","first-page":"2560","DOI":"10.1016\/j.procs.2024.02.074","volume":"232","author":"A Chakhtouna","year":"2024","unstructured":"Chakhtouna, A., et al.: Unveiling embedded features in Wav2vec2 and HuBERT Msodels for speech emotion recognition. Procedia Comput. Sci. 232, 2560\u20132569 (2024)","journal-title":"Procedia Comput. Sci."},{"key":"5883_CR41","doi-asserted-by":"publisher","DOI":"10.3390\/app12010327","author":"C Luna-Jim\u00e9nez","year":"2021","unstructured":"Luna-Jim\u00e9nez, C., et al.: A Proposal for Multimodal Emotion Recognition Using Aural Transformers and Action Units on RAVDESS Dataset. Applied Sciences (2021). https:\/\/doi.org\/10.3390\/app12010327","journal-title":"Applied Sciences"},{"key":"5883_CR42","doi-asserted-by":"crossref","unstructured":"Pepino, L., et al.: Emotion Recognition from Speech Using Wav2vec 2.0 Embeddings, Proc. Interspeech, pp. 3400\u20133404, (2021)","DOI":"10.21437\/Interspeech.2021-703"},{"key":"5883_CR43","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0318297","author":"N Wang","year":"2025","unstructured":"Wang, N., Yang, D.: Speech emotion recognition using fine-tuned Wav2vec2.0 and neural controlled differential equations classifier. Plos One (2025). https:\/\/doi.org\/10.1371\/journal.pone.0318297","journal-title":"Plos One"},{"key":"5883_CR44","doi-asserted-by":"publisher","DOI":"10.3389\/fnins.2023.1183132","author":"F Li","year":"2023","unstructured":"Li, F., et al.: GCF2-Net: global-aware cross-modal feature fusion network for speech emotion recognition. Frontiers in Neuroscience (2023). https:\/\/doi.org\/10.3389\/fnins.2023.1183132","journal-title":"Frontiers in Neuroscience"},{"issue":"6","key":"5883_CR45","doi-asserted-by":"publisher","first-page":"1576","DOI":"10.1109\/TMM.2017.2766843","volume":"20","author":"S Zhang","year":"2018","unstructured":"Zhang, S., et al.: Speech Emotion Recognition Using Deep Convolutional Neural Network and Discriminant Temporal Pyramid Matching. IEEE Transactions on Multimedia 20(6), 1576\u20131590 (2018)","journal-title":"IEEE Transactions on Multimedia"},{"key":"5883_CR46","doi-asserted-by":"crossref","unstructured":"Kim, J., et al.: Towards speech emotion recognition in the wild using aggregated corpora and deep Multi-Task learning, arXiv:170803920, (2017)","DOI":"10.21437\/Interspeech.2017-736"},{"key":"5883_CR47","doi-asserted-by":"crossref","unstructured":"Tiwari, U., et al.: Multi-Conditioning and Data Augmentation Using Generative Noise Model for Speech Emotion Recognition in Noisy Conditions, IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7194\u20137198, (2020)","DOI":"10.1109\/ICASSP40776.2020.9053581"},{"key":"5883_CR48","doi-asserted-by":"publisher","DOI":"10.3390\/bdcc7030146","author":"M Saumard","year":"2023","unstructured":"Saumard, M.: Enhancing Speech Emotions Recognition Using Multivariate Functional Data Analysis. Big Data and Cognitive Computing (2023). https:\/\/doi.org\/10.3390\/bdcc7030146","journal-title":"Big Data and Cognitive Computing"},{"key":"5883_CR49","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2022.1075624","author":"C Sun","year":"2022","unstructured":"Sun, C., et al.: Speech emotion recognition based on improved masking EMD and convolutional recurrent neural network. Frontiers in Psychology (2022). https:\/\/doi.org\/10.3389\/fpsyg.2022.1075624","journal-title":"Frontiers in Psychology"},{"key":"5883_CR50","doi-asserted-by":"publisher","DOI":"10.52228\/JRUB.2023-36-2-10","author":"N Dewangan","year":"2023","unstructured":"Dewangan, N., et al.: Time-Frequency Image-based Speech Emotion Recognition using Artificial Neural Network. Journal of Ravishankar University (PART-B) (2023). https:\/\/doi.org\/10.52228\/JRUB.2023-36-2-10","journal-title":"Journal of Ravishankar University (PART-B)"},{"key":"5883_CR51","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2023.109492","volume":"211","author":"C Hema","year":"2023","unstructured":"Hema, C., Garcia Marquez, F.P.: Emotional speech Recognition using CNN and Deep learning techniques. Applied Acoustics 211, 109492 (2023)","journal-title":"Applied Acoustics"},{"issue":"1","key":"5883_CR52","doi-asserted-by":"publisher","first-page":"190","DOI":"10.14500\/aro.12038","volume":"13","author":"AA Abdullah","year":"2025","unstructured":"Abdullah, A.A., et al.: In-depth Analysis on Machine Learning Approaches: Techniques, Applications, and Trends. Aro-The Scientific Journal of Koya University 13(1), 190\u2013202 (2025)","journal-title":"Aro-The Scientific Journal of Koya University"},{"key":"5883_CR53","unstructured":"Abdullah, A.A., et al.: Breaking walls: Pioneering automatic speech recognition for central kurdish: End-to-End transformer paradigm (2024). arXiv:2406.02561"},{"issue":"2","key":"5883_CR54","doi-asserted-by":"publisher","first-page":"108","DOI":"10.37652\/juaps.2022.176500","volume":"16","author":"AA Abdullah","year":"2022","unstructured":"Abdullah, A.A., Veisi, H.: Central Kurdish Automatic Speech Recognition using Deep Learning. Journal of University of Anbar for Pure Science 16(2), 108\u2013118 (2022)","journal-title":"Journal of University of Anbar for Pure Science"},{"key":"5883_CR55","unstructured":"Abdullah, A.A., et al.: End-to-End Transformer-based automatic speech recognition for Northern kurdish: A pioneering approach (2024). arXiv:2410.16330"}],"container-title":["Cluster Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-025-05883-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10586-025-05883-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-025-05883-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T13:46:54Z","timestamp":1768312014000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10586-025-05883-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,6]]},"references-count":55,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["5883"],"URL":"https:\/\/doi.org\/10.1007\/s10586-025-05883-z","relation":{},"ISSN":["1386-7857","1573-7543"],"issn-type":[{"value":"1386-7857","type":"print"},{"value":"1573-7543","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,6]]},"assertion":[{"value":"8 October 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 November 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 December 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 January 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 January 2026","order":6,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Update","order":7,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The original online version of this article was revised: The affiliation of the authors Yunus Korkmaz and Yaser Jararweh have been swapped. The affiliations are listed correctly now.","order":8,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"94"}}