{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T21:45:52Z","timestamp":1773092752162,"version":"3.50.1"},"reference-count":92,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2020,5,14]],"date-time":"2020-05-14T00:00:00Z","timestamp":1589414400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,5,14]],"date-time":"2020-05-14T00:00:00Z","timestamp":1589414400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2020,11]]},"DOI":"10.1007\/s00034-020-01429-3","type":"journal-article","created":{"date-parts":[[2020,5,14]],"date-time":"2020-05-14T00:02:25Z","timestamp":1589414545000},"page":"5681-5709","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":29,"title":["Attention and Feature Selection for Automatic Speech Emotion Recognition Using Utterance and Syllable-Level Prosodic Features"],"prefix":"10.1007","volume":"39","author":[{"given":"Starlet Ben","family":"Alex","sequence":"first","affiliation":[]},{"given":"Leena","family":"Mary","sequence":"additional","affiliation":[]},{"given":"Ben P.","family":"Babu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,5,14]]},"reference":[{"key":"1429_CR1","doi-asserted-by":"crossref","unstructured":"S.B. Alex, B.P. Babu, L. Mary, Utterance and syllable level prosodic features for automatic emotion recognition, in 2018 IEEE Recent Advances in Intelligent Computational Systems (RAICS) (IEEE, 2018) pp 31\u201335","DOI":"10.1109\/RAICS.2018.8635059"},{"key":"1429_CR2","unstructured":"D. Bahdanau, K. Cho, Y. Bengio, Neural machine translation by jointly learning to align and translate (2014). arXiv:1409.0473"},{"issue":"3","key":"1429_CR3","doi-asserted-by":"crossref","first-page":"614","DOI":"10.1037\/0022-3514.70.3.614","volume":"70","author":"R Banse","year":"1996","unstructured":"R. Banse, K.R. Scherer, Acoustic profiles in vocal emotion expression. J. Pers. Soc. Psychol. 70(3), 614\u2013636 (1996)","journal-title":"J. Pers. Soc. Psychol."},{"issue":"2","key":"1429_CR4","doi-asserted-by":"crossref","first-page":"244","DOI":"10.1109\/TETC.2013.2274797","volume":"1","author":"I Bisio","year":"2013","unstructured":"I. Bisio, A. Delfino, F. Lavagetto, M. Marchese, A. Sciarrone, Gender-driven emotion recognition through speech signals for ambient intelligence applications. IEEE Trans. Emerg. Top. Comput. 1(2), 244\u2013257 (2013)","journal-title":"IEEE Trans. Emerg. Top. Comput."},{"key":"1429_CR5","doi-asserted-by":"crossref","unstructured":"F. Burkhardt, A. Paeschke, M. Rolfes, W.F. Sendlmeier, B. Weiss, A database of German emotional speech, in Ninth European Conference on Speech Communication and Technology (2005)","DOI":"10.21437\/Interspeech.2005-446"},{"key":"1429_CR6","doi-asserted-by":"crossref","unstructured":"C. Busso, M. Bulut, C.C. Lee, A. Kazemzadeh, E. Mower, S. Kim, J.N. Chang, S. Lee, S.S. Narayanan, Iemocap: Interactive emotional dyadic motion capture database. Lang. Resour. Eval. 
42(4), 335 (2008)","DOI":"10.1007\/s10579-008-9076-6"},{"issue":"4","key":"1429_CR7","doi-asserted-by":"crossref","first-page":"582","DOI":"10.1109\/TASL.2008.2009578","volume":"17","author":"C Busso","year":"2009","unstructured":"C. Busso, S. Lee, S. Narayanan, Analysis of emotionally salient aspects of fundamental frequency for emotion detection. IEEE Trans. Audio Speech Lang. Process. 17(4), 582\u2013596 (2009)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"2","key":"1429_CR8","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1016\/S0376-6357(02)00078-5","volume":"60","author":"M Cabanac","year":"2002","unstructured":"M. Cabanac, What is emotion? Behav. Process. 60(2), 69\u201383 (2002)","journal-title":"Behav. Process."},{"issue":"6","key":"1429_CR9","doi-asserted-by":"crossref","first-page":"3392","DOI":"10.1121\/1.410601","volume":"96","author":"DA Cairns","year":"1994","unstructured":"D.A. Cairns, J.H. Hansen, Nonlinear analysis and classification of speech under stressed conditions. J. Acoust. Soc. Am. 96(6), 3392\u20133400 (1994)","journal-title":"J. Acoust. Soc. Am."},{"key":"1429_CR10","unstructured":"N. Campbell, P. Mokhtari, Voice quality: the 4th prosodic dimension, in 15th ICPhS, (2003), pp. 2417\u20132420"},{"issue":"6","key":"1429_CR11","doi-asserted-by":"crossref","first-page":"1154","DOI":"10.1016\/j.dsp.2012.05.007","volume":"22","author":"L Chen","year":"2012","unstructured":"L. Chen, X. Mao, Y. Xue, L.L. Cheng, Speech emotion recognition: Features and classification models. Digit. Signal Proc. 22(6), 1154\u20131160 (2012)","journal-title":"Digit. Signal Proc."},{"key":"1429_CR12","unstructured":"V. Chernykh, G. Sterling, P. Prihodko, Emotion recognition from speech with recurrent neural networks (2017). arXiv:1701.08071"},{"key":"1429_CR13","unstructured":"F. Chollet, et\u00a0al. Keras. https:\/\/github.com\/keras-team\/keras (2015)"},{"key":"1429_CR14","unstructured":"J.K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, Y. Bengio, Attention-based models for speech recognition, in Advances in Neural Information Processing Systems, (2015), pp. 577\u2013585"},{"issue":"1","key":"1429_CR15","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1109\/79.911197","volume":"18","author":"R Cowie","year":"2001","unstructured":"R. Cowie, E. Douglas-Cowie, N. Tsapatsoulis, G. Votsis, S. Kollias, W. Fellenz, J.G. Taylor, Emotion recognition in human-computer interaction. IEEE Signal Process. Mag. 18(1), 32\u201380 (2001)","journal-title":"IEEE Signal Process. Mag."},{"issue":"1","key":"1429_CR16","doi-asserted-by":"crossref","first-page":"30","DOI":"10.1109\/TASL.2011.2134090","volume":"20","author":"GE Dahl","year":"2012","unstructured":"G.E. Dahl, D. Yu, L. Deng, A. Acero, Context-dependent pre-trained deep neural networks for large-vocabulary speech recognition. IEEE Trans. Audio Speech Lang. Process. 20(1), 30\u201342 (2012)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"1429_CR17","doi-asserted-by":"crossref","DOI":"10.1093\/oso\/9780195112719.001.0001","volume-title":"The Expression of the Emotions in Man and Animals","author":"C Darwin","year":"1998","unstructured":"C. Darwin, P. Prodger, The Expression of the Emotions in Man and Animals (Oxford University Press, New York, 1998)"},{"issue":"4","key":"1429_CR18","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","volume":"28","author":"SB Davis","year":"1980","unstructured":"S.B. Davis, P. 
Mermelstein, Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE Trans. Acoust. Speech Signal Process. 28(4), 357\u2013366 (1980)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"1429_CR19","unstructured":"K. Sj\u00f6lander, The snack sound toolkit. http:\/\/www.speech.kth.se\/snack (2004)"},{"issue":"3","key":"1429_CR20","doi-asserted-by":"crossref","first-page":"572","DOI":"10.1016\/j.patcog.2010.09.020","volume":"44","author":"M El Ayadi","year":"2011","unstructured":"M. El Ayadi, M.S. Kamel, F. Karray, Survey on speech emotion recognition: Features, classification schemes, and databases. Pattern Recogn. 44(3), 572\u2013587 (2011)","journal-title":"Pattern Recogn."},{"key":"1429_CR21","doi-asserted-by":"crossref","unstructured":"F. Eyben, A. Batliner, B. Schuller, Towards a standard set of acoustic features for the processing of emotion in speech, in Proceedings of Meetings on Acoustics 159ASA, vol.\u00a09 (ASA, 2010), p 060006","DOI":"10.1121\/1.4739483"},{"issue":"2","key":"1429_CR22","doi-asserted-by":"crossref","first-page":"190","DOI":"10.1109\/TAFFC.2015.2457417","volume":"7","author":"F Eyben","year":"2015","unstructured":"F. Eyben, K.R. Scherer, B.W. Schuller, J. Sundberg, E. Andr\u00e9, C. Busso, L.Y. Devillers, J. Epps, P. Laukka, S.S. Narayanan et al., The geneva minimalistic acoustic parameter set (gemaps) for voice research and affective computing. IEEE Trans. Affect. Comput. 7(2), 190\u2013202 (2015)","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"4","key":"1429_CR23","doi-asserted-by":"crossref","first-page":"378","DOI":"10.1016\/S0892-1997(05)80199-7","volume":"9","author":"MP Gelfer","year":"1995","unstructured":"M.P. Gelfer, D.M. Fendel, Comparisons of jitter, shimmer, and signal-to-noise ratio from directly digitized versus taped voice samples. J. Voice 9(4), 378\u2013382 (1995)","journal-title":"J. Voice"},{"key":"1429_CR24","doi-asserted-by":"crossref","unstructured":"S. Gharsellaoui, S.A. Selouani, A.O. Dahmane, Automatic emotion recognition using auditory and prosodic indicative features, in IEEE 28th Canadian Conference on Electrical and Computer Engineering (CCECE), 2015 (IEEE, 2015) pp. 1265\u20131270","DOI":"10.1109\/CCECE.2015.7129460"},{"key":"1429_CR25","unstructured":"I.J. Goodfellow, D. Warde-Farley, M. Mirza, A. Courville, Y. Bengio, Maxout networks (2013). arXiv:1302.4389"},{"key":"1429_CR26","doi-asserted-by":"crossref","first-page":"531","DOI":"10.21437\/Interspeech.2018-1076","volume":"2018","author":"G Gosztolya","year":"2018","unstructured":"G. Gosztolya, T. Gr\u00f3sz, L. T\u00f3th, General utterance-level feature extraction for classifying crying sounds, atypical & self-assessed affect and heart beats. Proc. Interspeech 2018, 531\u2013535 (2018)","journal-title":"Proc. Interspeech"},{"key":"1429_CR27","unstructured":"A. Graves, Generating sequences with recurrent neural networks (2013). arXiv:1308.0850"},{"key":"1429_CR28","doi-asserted-by":"crossref","unstructured":"K. Han, D. Yu, I. Tashev, Speech emotion recognition using deep neural network and extreme learning machine, in Fifteenth Annual Conference of the International Speech Communication Association (2014)","DOI":"10.21437\/Interspeech.2014-57"},{"key":"1429_CR29","doi-asserted-by":"crossref","unstructured":"S. Hantke, H. Sagha, N. Cummins, B. 
Schuller, Emotional speech of mentally and physically disabled individuals: Introducing the emotass database and first findings, in Proceedings of Interspeech 2017 (ISCA, Stockholm, Sweden, 2017) pp. 3137\u20133141","DOI":"10.21437\/Interspeech.2017-409"},{"key":"1429_CR30","doi-asserted-by":"crossref","unstructured":"Q. Jin, C. Li, S. Chen, H. Wu, Speech emotion recognition with acoustic and lexical features, in 2015 IEEE international conference on acoustics, speech and signal processing (ICASSP) (IEEE, 2015) pp. 4749\u20134753","DOI":"10.1109\/ICASSP.2015.7178872"},{"issue":"3","key":"1429_CR31","doi-asserted-by":"crossref","first-page":"280","DOI":"10.1001\/archpsyc.1986.01800030098011","volume":"43","author":"WF Johnson","year":"1986","unstructured":"W.F. Johnson, R.N. Emde, K.R. Scherer, M.D. Klinnert, Recognition of emotion from vocal cues. Arch. Gen. Psychiatry 43(3), 280\u2013283 (1986)","journal-title":"Arch. Gen. Psychiatry"},{"issue":"5","key":"1429_CR32","doi-asserted-by":"crossref","first-page":"770","DOI":"10.1037\/0033-2909.129.5.770","volume":"129","author":"PN Juslin","year":"2003","unstructured":"P.N. Juslin, P. Laukka, Communication of emotions in vocal expression and music performance: Different channels, same code? Psychol. Bull. 129(5), 770\u2013814 (2003)","journal-title":"Psychol. Bull."},{"issue":"2","key":"1429_CR33","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1007\/s10772-011-9125-1","volume":"15","author":"SG Koolagudi","year":"2012","unstructured":"S.G. Koolagudi, K.S. Rao, Emotion recognition from speech: a review. Int. J. Speech Technol. 15(2), 99\u2013117 (2012)","journal-title":"Int. J. Speech Technol."},{"issue":"5","key":"1429_CR34","first-page":"1","volume":"8","author":"M Kuhn","year":"2008","unstructured":"M. Kuhn, Building predictive models in r using the caret package. J. Stat. Softw. 8(5), 1\u201326 (2008)","journal-title":"J. Stat. Softw."},{"key":"1429_CR35","doi-asserted-by":"crossref","unstructured":"O.W. Kwon, K. Chan, J. Hao, T.W. Lee, Emotion recognition by speech signals, in INTERSPEECH (2003)","DOI":"10.21437\/Eurospeech.2003-80"},{"key":"1429_CR36","doi-asserted-by":"crossref","first-page":"3107","DOI":"10.21437\/Interspeech.2018-1568","volume":"2018","author":"S Latif","year":"2018","unstructured":"S. Latif, R. Rana, J. Qadir, J. Epps, Variational autoencoders for learning latent representations of speech emotion: A preliminary study. Proc. Interspeech 2018, 3107\u20133111 (2018)","journal-title":"Proc. Interspeech"},{"key":"1429_CR37","first-page":"288","volume":"35","author":"P Laukka","year":"2000","unstructured":"P. Laukka, P.N. Juslin, A. Gabrielsson, Impact of intended emotion intensity on cue utilization and decoding accuracy in vocal expression of emotion. Int. J. Psychol. 35, 288\u2013289 (2000)","journal-title":"Int. J. Psychol."},{"issue":"2","key":"1429_CR38","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1109\/TSA.2004.838534","volume":"13","author":"CM Lee","year":"2005","unstructured":"C.M. Lee, S.S. Narayanan, Toward detecting emotions in spoken dialogs. IEEE Trans. Speech Audio Process. 13(2), 293\u2013303 (2005)","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"1429_CR39","unstructured":"C.M. Lee, S. Yildirim, M. Bulut, A. Kazemzadeh, C. Busso, Z. Deng, S. Lee, S. Narayanan, Emotion recognition based on phoneme classes, in Interspeech, (2004) pp. 205\u2013211"},{"key":"1429_CR40","doi-asserted-by":"crossref","unstructured":"J. Lee, I. 
Tashev, High-level feature representation using recurrent neural network for speech emotion recognition, in Sixteenth Annual Conference of the International Speech Communication Association (2015)","DOI":"10.21437\/Interspeech.2015-336"},{"key":"1429_CR41","doi-asserted-by":"crossref","unstructured":"P. Li, Y. Song, I. McLoughlin, W. Guo, L. Dai, An attention pooling based representation learning method for speech emotion recognition. Proc. Interspeech 2018, 3087\u20133091 (2018)","DOI":"10.21437\/Interspeech.2018-1242"},{"key":"1429_CR42","doi-asserted-by":"crossref","unstructured":"X. Li, J. Tao, M.T. Johnson, J. Soltis, A. Savage, K.M. Leong, J.D. Newman, Stress and emotion classification using jitter and shimmer features, in 2007 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) vol.\u00a04 (IEEE, 2007), pp IV\u20131081","DOI":"10.1109\/ICASSP.2007.367261"},{"issue":"3","key":"1429_CR43","first-page":"18","volume":"2","author":"A Liaw","year":"2002","unstructured":"A. Liaw, M. Wiener, Classification and regression by randomForest. R News 2(3), 18\u201322 (2002)","journal-title":"R News"},{"key":"1429_CR44","doi-asserted-by":"crossref","unstructured":"W. Lim, D. Jang, T. Lee, Speech emotion recognition using convolutional and recurrent neural networks, in Signal and Information Processing Association Annual Summit and Conference (APSIPA), 2016 Asia-Pacific (IEEE, 2016) pp. 1\u20134","DOI":"10.1109\/APSIPA.2016.7820699"},{"issue":"6","key":"1429_CR45","doi-asserted-by":"crossref","first-page":"490","DOI":"10.1109\/TMM.2010.2051872","volume":"12","author":"I Luengo","year":"2010","unstructured":"I. Luengo, E. Navas, I. Hern\u00e1ez, Feature analysis and evaluation for automatic emotion identification in speech. IEEE Trans. Multimedia 12(6), 490\u2013501 (2010)","journal-title":"IEEE Trans. Multimedia"},{"key":"1429_CR46","doi-asserted-by":"crossref","unstructured":"M. Lugger, B. Yang, The relevance of voice quality features in speaker independent emotion recognition, in 2007 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) vol\u00a04, (IEEE, 2007), pp. IV\u201317","DOI":"10.1109\/ICASSP.2007.367152"},{"key":"1429_CR47","doi-asserted-by":"crossref","first-page":"152","DOI":"10.21437\/Interspeech.2018-1832","volume":"2018","author":"D Luo","year":"2018","unstructured":"D. Luo, Y. Zou, D. Huang, Investigation on joint representation learning for robust feature extraction in speech emotion recognition. Proc. Interspeech 2018, 152\u2013156 (2018)","journal-title":"Proc. Interspeech"},{"issue":"10","key":"1429_CR48","doi-asserted-by":"crossref","first-page":"782","DOI":"10.1016\/j.specom.2008.04.010","volume":"50","author":"L Mary","year":"2008","unstructured":"L. Mary, B. Yegnanarayana, Extraction and representation of prosodic features for language and speaker recognition. Speech Commun. 50(10), 782\u2013796 (2008)","journal-title":"Speech Commun."},{"key":"1429_CR49","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10772-018-9517-6","volume":"21","author":"L Mary","year":"2018","unstructured":"L. Mary, A.P. Antony, B.P. Babu, S.M. Prasanna, Automatic syllabification of speech signal using short time energy and vowel onset points. Int. J. Speech Technol. 21, 1\u20139 (2018)","journal-title":"Int. J. Speech Technol."},{"issue":"2","key":"1429_CR50","doi-asserted-by":"crossref","first-page":"184","DOI":"10.1109\/T-AFFC.2011.40","volume":"3","author":"A Metallinou","year":"2012","unstructured":"A. Metallinou, M. 
Wollmer, A. Katsamanis, F. Eyben, B. Schuller, S. Narayanan, Context-sensitive learning for enhanced audiovisual emotion classification. IEEE Trans. Affect. Comput. 3(2), 184\u2013198 (2012)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"1429_CR51","doi-asserted-by":"crossref","unstructured":"S. Mirsamadi, E. Barsoum, C. Zhang, Automatic speech emotion recognition using recurrent neural networks with local attention, in 2017 IEEE International Conference on Acoustics (Speech and Signal Processing (ICASSP), IEEE, 2017), pp. 2227\u20132231","DOI":"10.1109\/ICASSP.2017.7952552"},{"key":"1429_CR52","unstructured":"V. Mohanan, L. Mary, Prosody based emotion recognition using SVM, in International Conference on Signal and Speech Processing (ICSSP), (2016) pp. 100\u2013105"},{"issue":"2","key":"1429_CR53","doi-asserted-by":"crossref","first-page":"1097","DOI":"10.1121\/1.405558","volume":"93","author":"IR Murray","year":"1993","unstructured":"I.R. Murray, J.L. Arnott, Toward the simulation of emotion in synthetic speech: A review of the literature on human vocal emotion. J. Acoust. Soc. Am. 93(2), 1097\u20131108 (1993)","journal-title":"J. Acoust. Soc. Am."},{"key":"1429_CR54","doi-asserted-by":"crossref","unstructured":"M. Neumann, N.T. Vu, Attentive convolutional neural network based speech emotion recognition: A study on the impact of input features, signal length, and acted speech (2017). arXiv:1706.00612","DOI":"10.21437\/Interspeech.2017-917"},{"issue":"4","key":"1429_CR55","doi-asserted-by":"crossref","first-page":"603","DOI":"10.1016\/S0167-6393(03)00099-2","volume":"41","author":"TL Nwe","year":"2003","unstructured":"T.L. Nwe, S.W. Foo, L.C. De Silva, Speech emotion recognition using hidden markov models. Speech Commun. 41(4), 603\u2013623 (2003)","journal-title":"Speech Commun."},{"key":"1429_CR56","doi-asserted-by":"crossref","unstructured":"J. Qiu, K. Sun, I.J. Rudas, H. Gao, Command filter-based adaptive nn control for mimo nonlinear systems with full-state constraints and actuator hysteresis. IEEE Trans. Cybern. (2019a)","DOI":"10.1109\/TCYB.2019.2944761"},{"issue":"11","key":"1429_CR57","doi-asserted-by":"crossref","first-page":"2152","DOI":"10.1109\/TFUZZ.2019.2895560","volume":"27","author":"J Qiu","year":"2019","unstructured":"J. Qiu, K. Sun, T. Wang, H. Gao, Observer-based fuzzy adaptive event-triggered control for pure-feedback nonlinear systems with prescribed performance. IEEE Trans. Fuzzy Syst. 27(11), 2152\u20132162 (2019b)","journal-title":"IEEE Trans. Fuzzy Syst."},{"key":"1429_CR58","unstructured":"R Core Team, R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria (2018). https:\/\/www.R-project.org\/"},{"issue":"2","key":"1429_CR59","doi-asserted-by":"crossref","first-page":"143","DOI":"10.1007\/s10772-012-9172-2","volume":"16","author":"KS Rao","year":"2013","unstructured":"K.S. Rao, S.G. Koolagudi, R.R. Vempada, Emotion recognition from speech using global and local prosodic features. Int. J. Speech Technol. 16(2), 143\u2013160 (2013)","journal-title":"Int. J. Speech Technol."},{"issue":"10","key":"1429_CR60","doi-asserted-by":"crossref","first-page":"1671","DOI":"10.1109\/LSP.2015.2420092","volume":"22","author":"F Richardson","year":"2015","unstructured":"F. Richardson, D. Reynolds, N. Dehak, Deep neural network approaches to speaker and language recognition. IEEE Signal Process. Lett. 22(10), 1671\u20131675 (2015)","journal-title":"IEEE Signal Process. 
Lett."},{"issue":"3","key":"1429_CR61","doi-asserted-by":"crossref","first-page":"315","DOI":"10.1016\/j.ipm.2008.09.003","volume":"45","author":"J Rong","year":"2009","unstructured":"J. Rong, G. Li, Y.P.P. Chen, Acoustic feature selection for automatic emotion recognition from speech. Inf. Process. Manag. 45(3), 315\u2013328 (2009)","journal-title":"Inf. Process. Manag."},{"key":"1429_CR62","doi-asserted-by":"crossref","unstructured":"A. Satt, S. Rozenberg, R. Hoory, Efficient emotion recognition from speech using deep learning on spectrograms, in INTERSPEECH, (2017) pp. 1089\u20131093","DOI":"10.21437\/Interspeech.2017-200"},{"key":"1429_CR63","unstructured":"K.R. Scherer, Methods of research on vocal communication: Paradigms and parameters. Handbook of methods in nonverbal behavior research, pp. 136\u2013198 (1982)"},{"issue":"3","key":"1429_CR64","first-page":"137","volume":"137","author":"KR Scherer","year":"2000","unstructured":"K.R. Scherer et al., Psychological models of emotion. Neuropsychol. Emot. 137(3), 137\u2013162 (2000)","journal-title":"Neuropsychol. Emot."},{"key":"1429_CR65","doi-asserted-by":"crossref","unstructured":"B. Schuller, G. Rigoll, Timing levels in segment-based speech emotion recognition, in Proceedings of International Conference on Spoken Language Processing ICSLP, Pittsburgh, USA, 2006","DOI":"10.21437\/Interspeech.2006-502"},{"key":"1429_CR66","unstructured":"B. Schuller, G. Rigoll, M. Lang, Speech emotion recognition combining acoustic features and linguistic information in a hybrid support vector machine-belief network architecture, in 2004 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), vol\u00a01, (IEEE, 2004), pp. I\u2013577"},{"key":"1429_CR67","unstructured":"B. Schuller, S. Steidl, A. Batliner, The interspeech 2009 emotion challenge, in Proceedings of Interspeech 2009, Brighton, UK, (2009) pp. 312\u2013315"},{"key":"1429_CR68","doi-asserted-by":"crossref","unstructured":"B. Schuller, S. Steidl, A. Batliner, F. Burkhardt, L. Devillers, C. M\u00fcller, S. Narayanan, The interspeech 2010 paralinguistic challenge, in Proceedings of INTERSPEECH 2010, Makuhari, Japan, (2010a) pp. 2794\u20132797","DOI":"10.21437\/Interspeech.2010-739"},{"issue":"2","key":"1429_CR69","doi-asserted-by":"crossref","first-page":"119","DOI":"10.1109\/T-AFFC.2010.8","volume":"1","author":"B Schuller","year":"2010","unstructured":"B. Schuller, B. Vlasenko, F. Eyben, M. Wollmer, A. Stuhlsatz, A. Wendemuth, G. Rigoll, Cross-corpus acoustic emotion recognition: Variances and strategies. IEEE Trans. Affect. Comput. 1(2), 119\u2013131 (2010b)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"1429_CR70","doi-asserted-by":"crossref","unstructured":"B. Schuller, A. Batliner, S. Steidl, F. Schiel, J. Krajewski, The interspeech 2011 speaker state challenge, in Proceedings of INTERSPEECH 2011, Florence, Italy (2011a)","DOI":"10.21437\/Interspeech.2011-801"},{"issue":"9\u201310","key":"1429_CR71","doi-asserted-by":"crossref","first-page":"1062","DOI":"10.1016\/j.specom.2011.01.011","volume":"53","author":"B Schuller","year":"2011","unstructured":"B. Schuller, A. Batliner, S. Steidl, D. Seppi, Recognising realistic emotions and affect in speech: State of the art and lessons learnt from the first challenge. Speech Commun. 53(9\u201310), 1062\u20131087 (2011b)","journal-title":"Speech Commun."},{"key":"1429_CR72","doi-asserted-by":"crossref","unstructured":"B. Schuller, S. Steidl, A. Batliner, E. N\u00f6th, A. Vinciarelli, F. Burkhardt, R. van Son, F. 
Weninger, F. Eyben, T. Bocklet, The interspeech 2012 speaker trait challenge, in Proceedings of INTERSPEECH, Portland (OR, USA, 2012)","DOI":"10.21437\/Interspeech.2012-86"},{"key":"1429_CR73","doi-asserted-by":"crossref","unstructured":"B. Schuller, S. Steidl, A. Batliner, A. Vinciarelli, K. Scherer, F. Ringeval, M. Chetouani, F. Weninger, F. Eyben, E. Marchi, et\u00a0al. The interspeech 2013 computational paralinguistics challenge: social signals, conflict, emotion, autism, in Proceedings of INTERSPEECH 2013, Lyon, France","DOI":"10.21437\/Interspeech.2013-56"},{"key":"1429_CR74","doi-asserted-by":"crossref","unstructured":"B.W. Schuller, S. Steidl, A. Batliner, P.B. Marschik, H. Baumeister, F. Dong, S. Hantke, F. Pokorny, E.M. Rathner, K.D. Bartl-Pokorny, et\u00a0al. The interspeech 2018 computational paralinguistics challenge: Atypical & self-assessed affect, crying & heart beats, in Proceedings of Interspeech 2018, Hyderabad, India pp. 122\u2013126 (2018)","DOI":"10.21437\/Interspeech.2018-51"},{"key":"1429_CR75","first-page":"5688","volume-title":"2011 IEEE international conference on Acoustics","author":"A Stuhlsatz","year":"2011","unstructured":"A. Stuhlsatz, C. Meyer, F. Eyben, T. Zielke, G. Meier, B. Schuller, Deep neural networks for acoustic emotion recognition: raising the benchmarks, 2011 IEEE international conference on Acoustics (Speech and Signal Processing (ICASSP), IEEE, 2011), pp. 5688\u20135691"},{"issue":"8","key":"1429_CR76","doi-asserted-by":"crossref","first-page":"1587","DOI":"10.1109\/TFUZZ.2018.2883374","volume":"27","author":"K Sun","year":"2018","unstructured":"K. Sun, S. Mou, J. Qiu, T. Wang, H. Gao, Adaptive fuzzy control for nontriangular structural stochastic switched nonlinear systems with full state constraints. IEEE Trans. Fuzzy Syst. 27(8), 1587\u20131601 (2018)","journal-title":"IEEE Trans. Fuzzy Syst."},{"key":"1429_CR77","first-page":"4509","volume-title":"2009 IEEE International Conference on Acoustics","author":"R Sun","year":"2009","unstructured":"R. Sun, E. Moore, J.F. Torres, Investigating glottal parameters for differentiating emotional categories with similar prosodics, 2009 IEEE International Conference on Acoustics (Speech and Signal Processing (ICASSP), IEEE, 2009), pp. 4509\u20134512"},{"issue":"3","key":"1429_CR78","doi-asserted-by":"crossref","first-page":"162","DOI":"10.1109\/T-AFFC.2011.14","volume":"2","author":"J Sundberg","year":"2011","unstructured":"J. Sundberg, S. Patel, E. Bjorkner, K.R. Scherer, Interdependencies among voice source parameters in emotional speech. IEEE Trans. Affect. Comput. 2(3), 162\u2013174 (2011)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"1429_CR79","doi-asserted-by":"crossref","unstructured":"D. Tacconi, O. Mayora, P. Lukowicz, B. Arnrich, C. Setz, G. Troster, C. Haring, Activity and emotion recognition to support early diagnosis of psychiatric diseases, in Second International Conference on Pervasive Computing Technologies for Healthcare, pp. 100\u2013102 (2008)","DOI":"10.4108\/ICST.PERVASIVEHEALTH2008.2511"},{"key":"1429_CR80","doi-asserted-by":"crossref","first-page":"162","DOI":"10.21437\/Interspeech.2018-2581","volume":"2018","author":"D Tang","year":"2018","unstructured":"D. Tang, J. Zeng, M. Li, An end-to-end deep learning framework with speech emotion recognition of atypical individuals. Proc. Interspeech 2018, 162\u2013166 (2018)","journal-title":"Proc. 
Interspeech"},{"issue":"3","key":"1429_CR81","doi-asserted-by":"crossref","first-page":"1697","DOI":"10.1121\/1.428453","volume":"107","author":"P Taylor","year":"2000","unstructured":"P. Taylor, Analysis and synthesis of intonation using the tilt model. J. Acoust. Soc. Am. 107(3), 1697\u20131714 (2000)","journal-title":"J. Acoust. Soc. Am."},{"issue":"1","key":"1429_CR82","doi-asserted-by":"crossref","first-page":"213","DOI":"10.1016\/S0167-6393(02)00083-3","volume":"40","author":"L Ten Bosch","year":"2003","unstructured":"L. Ten Bosch, Emotions, speech and the asr framework. Speech Commun. 40(1), 213\u2013225 (2003)","journal-title":"Speech Commun."},{"key":"1429_CR83","doi-asserted-by":"crossref","unstructured":"G. Trigeorgis, F. Ringeval, R. Brueckner, E. Marchi, M.A. Nicolaou, B. Schuller, S. Zafeiriou, Adieu features? end-to-end speech emotion recognition using a deep convolutional recurrent network, in 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2016) pp. 5200\u20135204","DOI":"10.1109\/ICASSP.2016.7472669"},{"key":"1429_CR84","first-page":"4052","volume-title":"2014 IEEE International Conference on Acoustics","author":"E Variani","year":"2014","unstructured":"E. Variani, X. Lei, E. McDermott, I. Lopez-Moreno, J. Gonzalez-Dominguez, Deep neural networks for small footprint text-dependent speaker verification, 2014 IEEE International Conference on Acoustics (Speech and Signal Processing (ICASSP), IEEE, 2014), pp. 4052\u20134056"},{"key":"1429_CR85","doi-asserted-by":"publisher","unstructured":"V. Vegesna, P. Jain, K. Gurugubelli, A. Vuppala, Emotional speech classifier systems: For sensitive assistance to support disabled individuals, pp. 6\u201310 (2018). https:\/\/doi.org\/10.21437\/SMM.2018-2","DOI":"10.21437\/SMM.2018-2"},{"key":"1429_CR86","doi-asserted-by":"crossref","first-page":"147","DOI":"10.21437\/Interspeech.2018-1238","volume":"2018","author":"J Wagner","year":"2018","unstructured":"J. Wagner, D. Schiller, A. Seiderer, E. Andr\u00e9, Deep learning in paralinguistic recognition tasks: Are hand-crafted features still relevant? Proc. Interspeech 2018, 147\u2013151 (2018)","journal-title":"Proc. Interspeech"},{"key":"1429_CR87","first-page":"5150","volume-title":"2017 IEEE International Conference on Acoustics","author":"ZQ Wang","year":"2017","unstructured":"Z.Q. Wang, I. Tashev, Learning utterance-level representations for speech emotion and age\/gender recognition using deep neural networks, 2017 IEEE International Conference on Acoustics (Speech and Signal Processing (ICASSP), IEEE, 2017), pp. 5150\u20135154"},{"key":"1429_CR88","unstructured":"S. Yildirim, M. Bulut, C.M. Lee, A. Kazemzadeh, Z. Deng, S. Lee, S. Narayanan, C. Busso, An acoustic study of emotions expressed in speech, in Eighth International Conference on Spoken Language Processing, (2004) pp. 2193\u20132196"},{"issue":"1","key":"1429_CR89","doi-asserted-by":"crossref","first-page":"29","DOI":"10.1016\/j.csl.2009.12.004","volume":"25","author":"S Yildirim","year":"2011","unstructured":"S. Yildirim, S. Narayanan, A. Potamianos, Detecting emotional state of a child in a conversational computer game. Comput. Speech Lang. 25(1), 29\u201344 (2011)","journal-title":"Comput. Speech Lang."},{"key":"1429_CR90","unstructured":"D. Yu, M.L. Seltzer, J. Li, J.T. Huang, F. Seide, Feature learning in deep neural networks-studies on speech recognition tasks (2013). 
arXiv:1301.3605"},{"key":"1429_CR91","doi-asserted-by":"crossref","first-page":"272","DOI":"10.21437\/Interspeech.2018-1477","volume":"2018","author":"Z Zhao","year":"2018","unstructured":"Z. Zhao, Y. Zheng, Z. Zhang, H. Wang, Y. Zhao, C. Li, Exploring spatio-temporal representations by integrating attention-based bidirectional-lstm-rnns and fcns for speech emotion recognition. Proc. Interspeech 2018, 272\u2013276 (2018)","journal-title":"Proc. Interspeech"},{"key":"1429_CR92","doi-asserted-by":"crossref","unstructured":"P. Zhou, W. Shi, J. Tian, Z. Qi, B. Li, H. Hao, B. Xu, Attention-based bidirectional long short-term memory networks for relation classification, in Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), vol\u00a02 (2016) pp. 207\u2013212","DOI":"10.18653\/v1\/P16-2034"}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-020-01429-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00034-020-01429-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-020-01429-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,5]],"date-time":"2024-08-05T20:22:55Z","timestamp":1722889375000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00034-020-01429-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5,14]]},"references-count":92,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2020,11]]}},"alternative-id":["1429"],"URL":"https:\/\/doi.org\/10.1007\/s00034-020-01429-3","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"value":"0278-081X","type":"print"},{"value":"1531-5878","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,5,14]]},"assertion":[{"value":"3 September 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 April 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 April 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 May 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with Ethical Standards"}},{"value":"The authors wish to confirm that there are no known conflicts of interest associated with this publication and there has been no significant financial support for this work that could have influenced its outcome.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}