{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T19:53:43Z","timestamp":1760385223122,"version":"3.37.3"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2012,6,19]],"date-time":"2012-06-19T00:00:00Z","timestamp":1340064000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2012,9]]},"DOI":"10.1007\/s10772-012-9158-0","type":"journal-article","created":{"date-parts":[[2012,6,18]],"date-time":"2012-06-18T16:00:02Z","timestamp":1340035202000},"page":"313-323","source":"Crossref","is-referenced-by-count":16,"title":["Synthesized speech for model training in cross-corpus recognition of human emotion"],"prefix":"10.1007","volume":"15","author":[{"given":"Bj\u00f6rn","family":"Schuller","sequence":"first","affiliation":[]},{"given":"Zixing","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Felix","family":"Weninger","sequence":"additional","affiliation":[]},{"given":"Felix","family":"Burkhardt","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2012,6,19]]},"reference":[{"key":"9158_CR1","first-page":"167","volume":"35","author":"G. Bergmann","year":"1988","unstructured":"Bergmann, G., Goldbeck, T., & Scherer, K. R. (1988). Emotionale Eindruckswirkung von prosodischen Sprachmerkmalen. Zeitschrift f\u00fcr experimentelle und angewandte Psychologie, 35, 167\u2013200.","journal-title":"Zeitschrift f\u00fcr experimentelle und angewandte Psychologie"},{"key":"9158_CR2","doi-asserted-by":"crossref","first-page":"3217","DOI":"10.21437\/Interspeech.2011-805","volume-title":"Proc. of INTERSPEECH","author":"D. Bone","year":"2011","unstructured":"Bone, D., Black, M. P., Li, M., Metallinou, A., Lee, S., & Narayanan, S. (2011). Intoxicated speech detection by fusion of speaker normalized hierarchical features and GMM supervectors. In Proc. of INTERSPEECH, Florence, Italy (pp. 3217\u20133220)."},{"key":"9158_CR3","volume-title":"Simulation emotionaler Sprechweise mit Sprachsynthesesystemen","author":"F. Burkhardt","year":"2000","unstructured":"Burkhardt, F. (2000). Simulation emotionaler Sprechweise mit Sprachsynthesesystemen. Aachen: Shaker Verlag."},{"key":"9158_CR4","volume-title":"Proc. Interspeech 2005","author":"F. Burkhardt","year":"2005","unstructured":"Burkhardt, F. (2005). Emofilt: The simulation of emotional speech by prosody transformation. In Proc. Interspeech 2005, Lisbon, Portugal."},{"key":"9158_CR5","volume-title":"Proc. Interspeech 2009","author":"F. Burkhardt","year":"2009","unstructured":"Burkhardt, F. (2009). Rule-based voice quality variation with formant synthesis. In Proc. Interspeech 2009."},{"key":"9158_CR6","volume-title":"Proc. of the ISCA workshop on speech and emotion","author":"F. Burkhardt","year":"2000","unstructured":"Burkhardt, F., & Sendlmeier, W. F. (2000). Verification of acoustical correlates of emotional speech using formant-synthesis. In Proc. of the ISCA workshop on speech and emotion."},{"key":"9158_CR7","doi-asserted-by":"crossref","first-page":"1517","DOI":"10.21437\/Interspeech.2005-446","volume-title":"Proc. Interspeech (ISCA)","author":"F. Burkhardt","year":"2005","unstructured":"Burkhardt, F., Paeschke, A., Rolfes, M., Sendlmeier, W., & Weiss, B. (2005). A database of German emotional speech. In Proc. Interspeech (ISCA), Lisbon, Portugal (pp. 1517\u20131520)."},{"key":"9158_CR8","first-page":"1","volume":"8","author":"J. E. Cahn","year":"1989","unstructured":"Cahn, J. E. (1989). The affect editor. Journal of the American Voice I\/O Society, 8, 1\u201319.","journal-title":"Journal of the American Voice I\/O Society"},{"key":"9158_CR9","first-page":"19","volume-title":"Proceedings of the ISCA workshop on speech and emotion","author":"R. Cowie","year":"2000","unstructured":"Cowie, R., Douglas-Cowie, E., Savvidou, S., McMahon, E., Sawey, M., & Schr\u00f6der, M. (2000). Feeltrace: an instrument for recording perceived emotion in real time. In Proceedings of the ISCA workshop on speech and emotion, Newcastle, Northern Ireland (pp. 19\u201324)."},{"key":"9158_CR10","volume-title":"Proc. ICSLP","author":"T. Dutoit","year":"1996","unstructured":"Dutoit, T., Pagel, V., Pierret, N., Bataille, F., & Van\u00a0der Vreken, O. (1996). The MBROLA project: towards a set of high-quality speech synthesizers free of use for non-commercial purposes. In Proc. ICSLP."},{"key":"9158_CR11","unstructured":"Engbert, I. S., & Hansen, A. V. Documentation of the Danish emotional speech database des. Tech. rep., Center for PersonKommunikation, Aalborg University, Denmark (2007). http:\/\/cpk.auc.dk\/~tb\/speech\/Emotions\/ . Last visited 11\/13\/2007."},{"key":"9158_CR12","volume-title":"Proc. affective computing and intelligent interaction (ACII)","author":"F. Eyben","year":"2009","unstructured":"Eyben, F., W\u00f6llmer, M., & Schuller, B. (2009). openEAR\u2014introducing the Munich open-source emotion and affect recognition toolkit. In Proc. affective computing and intelligent interaction (ACII), Amsterdam, The Netherlands. New York: IEEE."},{"key":"9158_CR13","first-page":"381","volume-title":"Proc. of ASRU","author":"M. Grimm","year":"2005","unstructured":"Grimm, M., & Kroschel, K. (2005). Evaluation of natural emotions using self assessment manikins. In Proc. of ASRU (pp. 381\u2013385)."},{"key":"9158_CR14","first-page":"865","volume-title":"Proc. of the IEEE international conference on multimedia and Expo (ICME)","author":"M. Grimm","year":"2008","unstructured":"Grimm, M., Kroschel, K., & Narayanan, S. (2008). The \u201cVera am Mittag\u201d German audio-visual emotional speech database. In Proc. of the IEEE international conference on multimedia and Expo (ICME), Hannover, Germany (pp. 865\u2013868)."},{"key":"9158_CR15","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1145\/1656274.1656278","volume":"11","author":"M. Hall","year":"2009","unstructured":"Hall, M., Frank, E., Holmes, G., Pfahringer, B., Reutemann, P., & Witten, I. H. (2009). The WEKA data mining software: an update. ACM SIGKDD Explorations Newsletter, 11, 10\u201318.","journal-title":"ACM SIGKDD Explorations Newsletter"},{"key":"9158_CR16","doi-asserted-by":"crossref","first-page":"1743","DOI":"10.21437\/Eurospeech.1997-494","volume-title":"Proc. EUROSPEECH-97","author":"J. Hansen","year":"1997","unstructured":"Hansen, J., & Bou-Ghazale, S. (1997). Getting started with susas: a speech under simulated and actual stress database. In Proc. EUROSPEECH-97, Rhodes, Greece (Vol.\u00a04, pp. 1743\u20131746)."},{"key":"9158_CR17","doi-asserted-by":"crossref","first-page":"11","DOI":"10.1145\/1178723.1178726","volume-title":"Proceedings of the 1st ACM workshop on audio and music computing multimedia table of contents","author":"K. Lee","year":"2006","unstructured":"Lee, K., & Slaney, M. (2006). Automatic chord recognition from audio using a supervised HMM trained with audio-from-symbolic data. In Proceedings of the 1st ACM workshop on audio and music computing multimedia table of contents, Santa Barbara, CA, USA (pp. 11\u201320). New York: ACM."},{"key":"9158_CR18","volume-title":"Proc. of text, speech and dialogue","author":"I. Lefter","year":"2010","unstructured":"Lefter, I., Rothkrantz, L. J. M., Wiggers, P., & van Leeuwen, D. A. (2010). Emotion recognition from speech by combining databases and fusion of classifiers. In Proc. of text, speech and dialogue, Berlin, Germany."},{"key":"9158_CR19","series-title":"Lecture notes in computer science","first-page":"743","volume-title":"Proc. of ACII","author":"D. D. Li","year":"2007","unstructured":"Li, D. D., & Yang, Y. C. (2007). Affect-insensitive speaker recognition by feature variety training. In A. Paiva, R. Prada, & R. W. Picard (Eds.), Proc. of ACII. Lecture notes in computer science: Vol. 4738 (pp. 743\u2013744). Berlin: Springer."},{"key":"9158_CR20","first-page":"1937","volume-title":"Proc. of ICASSP","author":"M. Li","year":"2012","unstructured":"Li, M., Metallinou, A., Bone, D., & Narayanan, S. (2012). Speaker states recognition using latent factor analysis based eigenchannel factor vector modeling. In Proc. of ICASSP, Kyoto, Japan (pp.\u00a01937\u20131940)."},{"key":"9158_CR21","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/MLSP.2009.5306198","volume-title":"Proc. of IEEE international workshop on machine learning for signal processing (MLSP)","author":"A. Mahdhaoui","year":"2009","unstructured":"Mahdhaoui, A., & Chetouani, M. (2009). A new approach for motherese detection using a semi-supervised algorithm. In Proc. of IEEE international workshop on machine learning for signal processing (MLSP) (pp. 1\u20136)."},{"key":"9158_CR22","volume-title":"IEEE workshop on multimedia database management","author":"O. Martin","year":"2006","unstructured":"Martin, O., Kotsia, I., Macq, B., & Pitas, I. (2006). The eNTERFACE\u201905 audio-visual emotion database. In IEEE workshop on multimedia database management."},{"key":"9158_CR23","first-page":"185","volume-title":"Advances in kernel methods: support vector learning","author":"J. C. Platt","year":"1999","unstructured":"Platt, J. C. (1999). Fast training of support vector machines using sequential minimal optimization. In Advances in kernel methods: support vector learning (pp. 185\u2013208). Cambridge: MIT Press."},{"key":"9158_CR24","unstructured":"Portele, T. TXT2PHO\u2014a TTS front end for the German inventories of the MBROLA project (1999). http:\/\/www.sk.uni-bonn.de\/forschung\/phonetik\/sprachsynthese\/txt2pho ."},{"issue":"6","key":"9158_CR25","doi-asserted-by":"crossref","first-page":"1161","DOI":"10.1037\/h0077714","volume":"39","author":"J. A. Russell","year":"1980","unstructured":"Russell, J. A. (1980). A circumplex model of affect. Journal of Personality and Social Psychology, 39(6), 1161\u20131178.","journal-title":"Journal of Personality and Social Psychology"},{"key":"9158_CR26","first-page":"807","volume-title":"Proc. of ICSLP","author":"K. R. Scherer","year":"2000","unstructured":"Scherer, K. R., Johnstone, T., Klasmeyer, G., & B\u00e4nziger, T. (2000). Can automatic speaker verification be improved by training the algorithms on emotional speech. In Proc. of ICSLP, Beijing, China (pp. 807\u2013810)."},{"key":"9158_CR27","first-page":"2589","volume-title":"Proc. 15th international conference of phonetic sciences","author":"M. Schr\u00f6der","year":"2003","unstructured":"Schr\u00f6der, M., & Grice, M. (2003). Expressing vocal effort in concatenative synthesis. In Proc. 15th international conference of phonetic sciences, Barcelona, Spain (pp. 2589\u20132592)."},{"key":"9158_CR28","doi-asserted-by":"crossref","unstructured":"Schr\u00f6der, M., & Trouvain, J. (2003). The german text-to-speech synthesis system MARY: a\u00a0tool for research, development and teaching. International Journal of Speech Technology 365\u2013377.","DOI":"10.1023\/A:1025708916924"},{"key":"9158_CR29","first-page":"5150","volume-title":"Proc. ICASSP 2010","author":"B. Schuller","year":"2010","unstructured":"Schuller, B., & Burkhardt, F. (2010). Learning with synthesized speech for automatic emotion recognition. In Proc. ICASSP 2010, Dallas, TX, USA (pp. 5150\u20135153)."},{"key":"9158_CR30","first-page":"733","volume-title":"Proc. ICASSP 2007","author":"B. Schuller","year":"2007","unstructured":"Schuller, B., Wimmer, M., Arsic, D., Rigoll, G., & Radig, B. (2007). Audiovisual behavior modeling by combined feature spaces. In Proc. ICASSP 2007 (Vol. II, pp. 733\u2013736). New York: IEEE Press, Honolulu, Hawaii, USA."},{"issue":"12","key":"9158_CR31","doi-asserted-by":"crossref","first-page":"1760","DOI":"10.1016\/j.imavis.2009.02.013","volume":"27","author":"B. Schuller","year":"2009","unstructured":"Schuller, B., M\u00fcller, R., Eyben, F., Gast, J., H\u00f6rnler, B., W\u00f6llmer, M., Rigoll, G., H\u00f6thker, A., & Konosu, H. (2009a). Being bored? Recognising natural interest by extensive audiovisual integration for real-life application. Image and Vision Computing, 27(12), 1760\u20131774. Special issue on visual and multimodal analysis of human spontaneous behavior.","journal-title":"Image and Vision Computing"},{"key":"9158_CR32","first-page":"552","volume-title":"Proc. IEEE ASRU","author":"B. Schuller","year":"2009","unstructured":"Schuller, B., Vlasenko, B., Eyben, F., Rigoll, G., & Wendemuth, A. (2009b). Acoustic emotion recognition: a benchmark comparison of performances. In Proc. IEEE ASRU, Merano, Italy (pp.\u00a0552\u2013557)."},{"key":"9158_CR33","doi-asserted-by":"crossref","first-page":"1062","DOI":"10.1016\/j.specom.2011.01.011","volume":"53","author":"B. Schuller","year":"2011","unstructured":"Schuller, B., Batliner, A., Steidl, S., & Seppi, D. (2011). Recognising realistic emotions and affect in speech: state of the art and\u00a0lessons learnt from the first challenge. Speech Communication, 53, 1062\u20131087.","journal-title":"Speech Communication"},{"key":"9158_CR34","doi-asserted-by":"crossref","unstructured":"Schuller, B., Steidl, S., Batliner, A., Burkhardt, F., Devillers, L., M\u00fcller, C., & Narayanan, S. (2012, to appear). Paralinguistics in speech and language\u2014state-of-the-art and the challenge. Computer Speech and Language (Special issue on paralinguistics in naturalistic speech and language) (2012) 39\u00a0p. http:\/\/dx.doi.org\/10.1016\/j.csl.2012.02.005 .","DOI":"10.1016\/j.csl.2012.02.005"},{"key":"9158_CR35","volume-title":"Proc. of speech prosody","author":"S. Steidl","year":"2012","unstructured":"Steidl, S., Polzehl, T., Bunnell, H. T., Dou, Y., Muthukumar, P. K., Perry, D., Prahallad, K., Vaughn, C., Black, A. W., & Metze, F. (2012). Emotion identification for evaluation of synthesized emotional speech. In Proc. of speech prosody."},{"key":"9158_CR36","first-page":"2102","volume-title":"Proc. of INTERSPEECH","author":"W. Wu","year":"2006","unstructured":"Wu, W., Zheng, T. F., Xu, M. X., & Bao, H. J. (2006). Study on speaker verification on emotional speech. In Proc. of INTERSPEECH, Pittsburgh, PA (pp.\u00a02102\u20132105)."},{"key":"9158_CR37","volume-title":"The HTK book, for HTK version 3","author":"S. Young","year":"2006","unstructured":"Young, S., Evermann, G., Gales, M., Hain, T., Kershaw, D., Liu, X., Moore, G., Odell, J., Ollason, D., Povey, D., Valtchev, V., & Woodland, P. (2006). The HTK book, for HTK version 3, 4th ed. Cambridge: Cambridge University, Engineering Department.","edition":"4"},{"issue":"1","key":"9158_CR38","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1109\/TPAMI.2008.52","volume":"31","author":"Z. Zeng","year":"2009","unstructured":"Zeng, Z., Pantic, M., Roisman, G. I., & Huang, T. S. (2009). A survey of affect recognition methods: audio, visual, and spontaneous expressions. IEEE Transactions on Pattern Analysis and Machine Intelligence, 31(1), 39\u201358.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"9158_CR39","volume-title":"Proc. IEEE automatic speech recognition and understanding workshop (ASRU)","author":"Z. Zhang","year":"2011","unstructured":"Zhang, Z., Weninger, F., W\u00f6llmer, M., & Schuller, B. (2011). Unsupervised learning in cross-corpus acoustic emotion recognition. In Proc. IEEE automatic speech recognition and understanding workshop (ASRU), Big Island, HI, USA."}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-012-9158-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10772-012-9158-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-012-9158-0","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,23]],"date-time":"2023-06-23T15:19:35Z","timestamp":1687533575000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10772-012-9158-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,6,19]]},"references-count":39,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2012,9]]}},"alternative-id":["9158"],"URL":"https:\/\/doi.org\/10.1007\/s10772-012-9158-0","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2012,6,19]]}}}