{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T10:15:07Z","timestamp":1760955307467},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2011,10,19]],"date-time":"2011-10-19T00:00:00Z","timestamp":1318982400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["J Ambient Intell Human Comput"],"published-print":{"date-parts":[[2012,3]]},"DOI":"10.1007\/s12652-011-0088-5","type":"journal-article","created":{"date-parts":[[2011,10,18]],"date-time":"2011-10-18T16:30:56Z","timestamp":1318955456000},"page":"47-60","source":"Crossref","is-referenced-by-count":12,"title":["Robust emotion recognition by spectro-temporal modulation statistic features"],"prefix":"10.1007","volume":"3","author":[{"given":"Tai-Shih","family":"Chi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lan-Ying","family":"Yeh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chin-Cheng","family":"Hsu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2011,10,19]]},"reference":[{"issue":"1","key":"88_CR1","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1145\/1007730.1007735","volume":"6","author":"GEAPA Batista","year":"2004","unstructured":"Batista GEAPA, Prati RC, Monard MC (2004) A study of the behavior of several methods for balancing machine learning training data. ACM SIGKDD Explor Newslett 6(1):20\u201329","journal-title":"ACM SIGKDD Explor Newslett"},{"key":"88_CR2","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/1486.001.0001","volume-title":"Auditory scene analysis: The perceptual organization of sound","author":"AS Bregman","year":"1990","unstructured":"Bregman AS (1990) Auditory scene analysis: The perceptual organization of sound. MIT press, Cambridge"},{"key":"88_CR3","doi-asserted-by":"crossref","unstructured":"Burkhardt F, Paeschke A, Rolfes M, Sendlmeier W, Weiss B (2005) A database of German emotional speech. In: Proceedings of Interspeech, pp 489\u2013492","DOI":"10.21437\/Interspeech.2005-446"},{"key":"88_CR4","doi-asserted-by":"crossref","first-page":"582","DOI":"10.1109\/TASL.2008.2009578","volume":"17","author":"C Busso","year":"2009","unstructured":"Busso C, Lee S, Narayanan S (2009) Analysis of emotionally salient aspects of fundamental frequency for emotion detection. IEEE Trans Audio Speech Lang Process 17:582\u2013596","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"88_CR5","doi-asserted-by":"crossref","first-page":"304","DOI":"10.1121\/1.429466","volume":"108","author":"RP Carlyon","year":"2000","unstructured":"Carlyon RP, Moore BCJ, Micheyl C (2000) The effect of modulation rate on the detection of frequency modulation and mistuning of complex tones. J Acoust Soc Am 108:304\u2013315","journal-title":"J Acoust Soc Am"},{"key":"88_CR6","unstructured":"Chang CC, Lin CJ (2001) LIBSVM: a library for support vector machines. Software available at http:\/\/www.csie.ntu.edu.tw\/~cjlin\/libsvm"},{"key":"88_CR7","doi-asserted-by":"crossref","first-page":"321","DOI":"10.1613\/jair.953","volume":"16","author":"NV Chawla","year":"2002","unstructured":"Chawla NV, Hall LO, Bowyer KW, Kegelmeyer WP (2002) SMOTE: synthetic minority oversampling technique. J Artif Intell Res 16:321\u2013357","journal-title":"J Artif Intell Res"},{"issue":"5","key":"88_CR8","doi-asserted-by":"crossref","first-page":"EL190","DOI":"10.1121\/1.3565471","volume":"129","author":"TS Chi","year":"2011","unstructured":"Chi TS, Hsu CC (2011) Multiband analysis and synthesis of spectro-temporal modulations of Fourier spectrogram. J Acoust Soc Am 129(5):EL190\u2013EL196","journal-title":"J Acoust Soc Am"},{"issue":"2","key":"88_CR9","doi-asserted-by":"crossref","first-page":"887","DOI":"10.1121\/1.1945807","volume":"118","author":"T Chi","year":"2005","unstructured":"Chi T, Ru P, Shamma SA (2005) Multi-resolution spectro-temporal analysis of complex sounds. J Acoust Soc Am 118(2):887\u2013906","journal-title":"J Acoust Soc Am"},{"key":"88_CR10","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1109\/79.911197","volume":"18","author":"R Cowie","year":"2001","unstructured":"Cowie R, Douglas-Cowie E, Tsapatsoulis N, Votsis G, Kollias S, Fellenz W, Taylor JG (2001) Emotion recognition in human-computer interaction. IEEE Signal Process Magazine 18:32\u201380","journal-title":"IEEE Signal Process Magazine"},{"key":"88_CR11","unstructured":"Eyben F, Wollmer M, Schuller B (2009) Speech and music interpretation by large-space extraction. http:\/\/sourceforge.net\/projects\/openSMILE"},{"key":"88_CR12","doi-asserted-by":"crossref","unstructured":"Ezzat T, Bouvrie J, Poggio T (2007) Spectro-temporal analysis of speech using 2-D Gabor filters. In: Proceedings of Interspeech, pp 506\u2013509","DOI":"10.21437\/Interspeech.2007-236"},{"key":"88_CR13","doi-asserted-by":"crossref","first-page":"90","DOI":"10.1109\/TASL.2009.2023679","volume":"18","author":"TH Falk","year":"2010","unstructured":"Falk TH, Chan WY (2010) Modulation spectral features for robust far-field speaker identification. IEEE Trans Audio Speech Lang Process 18:90\u2013100","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"88_CR14","doi-asserted-by":"crossref","first-page":"1340","DOI":"10.1121\/1.1452740","volume":"111","author":"N Grimault","year":"2002","unstructured":"Grimault N, Bacon SP, Micheyl C (2002) Auditory stream segregation on the basis of amplitude modulation rate. J Acoust Soc Am 111:1340\u20131348","journal-title":"J Acoust Soc Am"},{"key":"88_CR15","unstructured":"Jiang DN, Cai LH (2004) Speech emotion classification with the combination of statistic features and temporal features. In: Proceedings of the ICME, pp 1967\u20131970"},{"key":"88_CR16","doi-asserted-by":"crossref","unstructured":"Kawahara H, de Cheveign\u00e9 A, Banno H, Takahashi T, Irino T (2005) Nearly defect-free F0 trajectory extraction for expressive speech modifications based on STRAIGHT. In: Proceedings of Interspeech, pp 537\u2013540","DOI":"10.21437\/Interspeech.2005-335"},{"key":"88_CR17","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1109\/TSA.2004.838534","volume":"13","author":"CM Lee","year":"2005","unstructured":"Lee CM, Narayanan SS (2005) Toward detecting emotions in spoken dialogs. IEEE Trans Speech Audio Process 13:293\u2013303","journal-title":"IEEE Trans Speech Audio Process"},{"key":"88_CR18","doi-asserted-by":"crossref","DOI":"10.1201\/9781420015836","volume-title":"Speech enhancement: theory and practice","author":"PC Loizou","year":"2007","unstructured":"Loizou PC (2007) Speech enhancement: theory and practice. CRC, New York"},{"issue":"5","key":"88_CR19","doi-asserted-by":"crossref","first-page":"753","DOI":"10.1016\/j.specom.2010.07.002","volume":"53","author":"BT Meyer","year":"2011","unstructured":"Meyer BT, Kollmeier B (2011) Robustness of spectro-temporal features against intrinsic and extrinsic variations in automatic speech recognition. Speech Commun 53(5):753\u2013767","journal-title":"Speech Commun"},{"key":"88_CR20","doi-asserted-by":"crossref","unstructured":"Mozziconacci S (2002) Prosody and emotions. In: Proceedings of Speech Prosody, pp 1\u20139","DOI":"10.21437\/SpeechProsody.2002-1"},{"key":"88_CR21","doi-asserted-by":"crossref","first-page":"603","DOI":"10.1016\/S0167-6393(03)00099-2","volume":"41","author":"T New","year":"2003","unstructured":"New T, Foo S, DeSilva L (2003) Speech emotion recognition using hidden markov models. Speech Commun 41:603\u2013623","journal-title":"Speech Commun"},{"key":"88_CR22","volume-title":"Speech communications-human and machine","author":"D O\u2019Shaughnessy","year":"2000","unstructured":"O\u2019Shaughnessy D (2000) Speech communications-human and machine, 2nd edn. IEEE Press, Piscataway","edition":"2"},{"key":"88_CR23","unstructured":"Pao TL, Chen YT, Yeh JH, Li PJ (2006) Mandarin emotional speech recognition based on SVM and NN. In: Proceedings of the 18th International Conference on Pattern Recognition, vol. 1, pp 1096\u20131110"},{"key":"88_CR24","doi-asserted-by":"crossref","unstructured":"Pudil P, Ferri FJ, Novovicova J, Kittler J (1994) Floating search methods for feature selection with nonmonotonic criterion functions. In: Proceedings of the international Conference on Computer Vision & Image Processing, pp 279\u2013283","DOI":"10.1109\/ICPR.1994.576920"},{"key":"88_CR25","doi-asserted-by":"crossref","unstructured":"Ringeval F, Chetouani M (2008) A vowel based approach for acted emotion recognition. In: Proceedings of Interspeech, pp 2763\u20132766","DOI":"10.1007\/978-3-540-70872-8_19"},{"key":"88_CR26","doi-asserted-by":"crossref","first-page":"227","DOI":"10.1016\/S0167-6393(02)00084-5","volume":"40","author":"K Scherer","year":"2003","unstructured":"Scherer K (2003) Vocal communication of emotion: A review of research paradigms. Speech Commun 40:227\u2013256","journal-title":"Speech Commun"},{"key":"88_CR27","doi-asserted-by":"crossref","unstructured":"Schuller B, Rigoll G (2006) Timing levels in segment-based speech emotion recognition. In: Proceedings of Interspeech, pp 1818\u20131821","DOI":"10.21437\/Interspeech.2006-502"},{"key":"88_CR28","unstructured":"Schuller B, Rigoll G, Lang M (2003) Hidden markov model-based speech emotion recognition. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), vol. 2, pp 1\u20134"},{"key":"88_CR29","doi-asserted-by":"crossref","unstructured":"Schuller B, Rigoll G, Lang M (2004) Speech emotion recognition combining acoustic features and linguistic information in a hybrid support vector machine-belief network architecture. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), vol. 1, pp 577\u2013580","DOI":"10.1109\/ICASSP.2004.1326051"},{"key":"88_CR30","doi-asserted-by":"crossref","unstructured":"Schuller B, Arsi\u0107 D, Wallhoff F, Rigoll G (2006) Emotion recognition in the noise applying large acoustic feature sets. In: Proceedings of Speech Prosody","DOI":"10.21437\/SpeechProsody.2006-150"},{"key":"88_CR31","doi-asserted-by":"crossref","unstructured":"Schuller B, Batliner A, Seppi D, Steidl S, Vogt T, Wagner J, Devillers L, Vidrascu L, Amir N, Kessous L, Aharonson V (2007a). The relevance of feature type for the automatic classification of emotional user states: Low level descriptors and functionals. In: Proceedings of Interspeech, pp 2253\u20132256","DOI":"10.21437\/Interspeech.2007-612"},{"key":"88_CR32","doi-asserted-by":"crossref","unstructured":"Schuller B, Seppi D, Batliner A, Maier A, Steidl S (2007b) Towards more reality in the recognition of emotional speech. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), vol. 4, pp 941\u2013944","DOI":"10.1109\/ICASSP.2007.367226"},{"key":"88_CR33","unstructured":"Schuller B, W\u00f6llmer M, Eyben F, Rigoll G (2009) Spectral or voice quality? Feature type relevance for the discrimination of emotion pairs. In: Hancil S (ed) The Role of Prosody in Affective Speech, Linguistic Insights, Studies in Language and Communication, Vol. 97. Peter Lang Publishing Group, New York, pp 285\u2013307"},{"key":"88_CR34","doi-asserted-by":"crossref","first-page":"2631","DOI":"10.1121\/1.428649","volume":"107","author":"S Shamma","year":"2000","unstructured":"Shamma S, Klein DJ (2000) The case of the missing pitch templates: how harmonic templates may form in the early auditory system. J Acoust Soc Am 107:2631\u20132644","journal-title":"J Acoust Soc Am"},{"key":"88_CR35","volume-title":"Automatic classification of emotion-related user states in spontaneous children\u2019s speech","author":"S Steidl","year":"2009","unstructured":"Steidl S (2009) Automatic classification of emotion-related user states in spontaneous children\u2019s speech. Logos Verlag, Berlin"},{"issue":"3","key":"88_CR36","doi-asserted-by":"crossref","first-page":"247","DOI":"10.1016\/0167-6393(93)90095-3","volume":"12","author":"A Varga","year":"1993","unstructured":"Varga A, Steeneken HJM (1993) Assessment for automatic speech recognition: II. NOISEX-92: a database and an experiment to study the effect of additive noise on speech recognition systems. Speech Commun 12(3):247\u2013251","journal-title":"Speech Commun"},{"issue":"9","key":"88_CR37","doi-asserted-by":"crossref","first-page":"1162","DOI":"10.1016\/j.specom.2006.04.003","volume":"48","author":"V Ververidis","year":"2006","unstructured":"Ververidis V, Kotropoulos C (2006) Emotional speech recognition: resources, features, and methods. Speech Commun 48(9):1162\u20131181","journal-title":"Speech Commun"},{"issue":"5","key":"88_CR38","doi-asserted-by":"crossref","first-page":"768","DOI":"10.1016\/j.specom.2010.08.013","volume":"53","author":"S Wu","year":"2011","unstructured":"Wu S, Falk TH, Chan WY (2011) Automatic speech emotion recognition using modulation spectral features. Speech Commun 53(5):768\u2013785","journal-title":"Speech Commun"},{"key":"88_CR39","doi-asserted-by":"crossref","unstructured":"You M, Chen C, Bu J, Liu J, Tao J (2006) Emotion recognition from noisy speech. In: Proceedings of the ICME, pp 1653\u20131656","DOI":"10.1109\/ICME.2006.262865"},{"key":"88_CR40","doi-asserted-by":"crossref","unstructured":"Yu F, Chang E, Xu YQ, Shum HY (2001) Emotion detection from speech to enrich multimedia content. In: Proceedings of the IEEE Pacific-Rim Conference on Multimedia, vol 1, pp 550\u2013557","DOI":"10.1007\/3-540-45453-5_71"},{"issue":"1","key":"88_CR41","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1109\/TPAMI.2008.52","volume":"31","author":"Z Zeng","year":"2009","unstructured":"Zeng Z, Pantic M, Rosiman GI, Huang TS (2009) A survey of affect recognition methods: Audio, visual, and spontaneous expressions. IEEE Trans Pattern Anal Mach Intell 31(1):39\u201358","journal-title":"IEEE Trans Pattern Anal Mach Intell"}],"container-title":["Journal of Ambient Intelligence and Humanized Computing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s12652-011-0088-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s12652-011-0088-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s12652-011-0088-5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,13]],"date-time":"2024-04-13T21:02:21Z","timestamp":1713042141000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s12652-011-0088-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2011,10,19]]},"references-count":41,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2012,3]]}},"alternative-id":["88"],"URL":"https:\/\/doi.org\/10.1007\/s12652-011-0088-5","relation":{},"ISSN":["1868-5137","1868-5145"],"issn-type":[{"value":"1868-5137","type":"print"},{"value":"1868-5145","type":"electronic"}],"subject":[],"published":{"date-parts":[[2011,10,19]]}}}