{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,18]],"date-time":"2025-05-18T21:40:01Z","timestamp":1747604401803,"version":"3.40.5"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319155562"},{"type":"electronic","value":"9783319155579"}],"license":[{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015]]},"DOI":"10.1007\/978-3-319-15557-9_2","type":"book-chapter","created":{"date-parts":[[2015,2,11]],"date-time":"2015-02-11T13:05:56Z","timestamp":1423659956000},"page":"15-24","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Steps Towards More Natural Human-Machine Interaction via Audio-Visual Word Prominence Detection"],"prefix":"10.1007","author":[{"given":"Martin","family":"Heckmann","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2015,2,12]]},"reference":[{"key":"2_CR1","unstructured":"Replaygain 1.0 specification. http:\/\/wiki.hydrogenaudio.org\/"},{"key":"2_CR2","unstructured":"Al Moubayed, S., Beskow, J.: Effects of visual prominence cues on speech intelligibility. In: Proceedings of International Conference on Auditory Visual Speech Process. (AVSP), vol. 9, p. 16. ISCA (2009)"},{"key":"2_CR3","doi-asserted-by":"crossref","unstructured":"Arias, J.P., Busso, C., Yoma, N.B.: Energy and f0 contour modeling with functional data analysis for emotional speech detection. In: Proceedings of INTERSPEECH, Lyon, FR (2013)","DOI":"10.21437\/Interspeech.2013-253"},{"issue":"1","key":"2_CR4","doi-asserted-by":"publisher","first-page":"278","DOI":"10.1016\/j.csl.2013.07.002","volume":"28","author":"JP Arias","year":"2014","unstructured":"Arias, J.P., Busso, C., Yoma, N.B.: Shape-based modeling of the fundamental frequency contour for emotion detection in speech. Comput. Speech Lang. 28(1), 278\u2013294 (2014)","journal-title":"Comput. Speech Lang."},{"key":"2_CR5","doi-asserted-by":"crossref","unstructured":"Beskow, J., Granstr\u00f6m, B., House, D.: Visual correlates to prominence in several expressive modes. In: Proceedings of INTERSPEECH, pp. 1272\u20131275. ISCA (2006)","DOI":"10.21437\/Interspeech.2006-375"},{"key":"2_CR6","unstructured":"Black, A., Taylor, P., Caley, R.: The festival speech synthesis system. Technical report (1998)"},{"key":"2_CR7","first-page":"122","volume":"25","author":"G Bradski","year":"2000","unstructured":"Bradski, G.: The openCV library. Dr. Dobb\u2019s J. Softw. Tools 25, 122\u2013125 (2000)","journal-title":"Dr. Dobb\u2019s J. Softw. Tools"},{"key":"2_CR8","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1007\/978-1-4614-8280-2_13","volume-title":"Natural Interaction with Robots, Knowbots and Smartphones - Putting Spoken Dialog Systems into Practice","author":"A Buendia","year":"2014","unstructured":"Buendia, A., Devillers, L.: From informative cooperative dialogues to long-term social relation with a robot. In: Mariani, J., Devillers, L., Garnier-Rizet, M., Rosset, S. (eds.) Natural Interaction with Robots, Knowbots and Smartphones - Putting Spoken Dialog Systems into Practice, pp. 135\u2013151. Springer, Heidelberg (2014)"},{"key":"2_CR9","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1007\/978-3-540-76442-7_11","volume-title":"Verbal and Nonverbal Communication Behaviours","author":"N Campbell","year":"2007","unstructured":"Campbell, N.: On the use of nonverbal speech sounds in human communication. In: Esposito, A., Faundez-Zanuy, M., Keller, E., Marinaro, M. (eds.) COST Action 2102. LNCS (LNAI), vol. 4775, pp. 117\u2013128. Springer, Heidelberg (2007)"},{"issue":"1","key":"2_CR10","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.robot.2007.09.015","volume":"56","author":"A Ceravola","year":"2008","unstructured":"Ceravola, A., Stein, M., Goerick, C.: Researching and developing a real-time infrastructure for intelligent systems - evolution of an integrated approach. Robot. Auton. Syst. 56(1), 14\u201328 (2008)","journal-title":"Robot. Auton. Syst."},{"key":"2_CR11","doi-asserted-by":"publisher","first-page":"27:1","DOI":"10.1145\/1961189.1961199","volume":"2","author":"CC Chang","year":"2011","unstructured":"Chang, C.C., Lin, C.J.: LIBSVM: a library for support vector machines. ACM Trans. Intell. Syst. Technol. 2, 27:1\u201327:27 (2011). http:\/\/www.csie.ntu.edu.tw\/\u00a0cjlin\/libsvm","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"2_CR12","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S., Shao, X.: An audio-visual corpus for speech perception and automatic speech recognition. J. Acoust. Soc. Am. 120, 2421\u20132424 (2006)","journal-title":"J. Acoust. Soc. Am."},{"key":"2_CR13","doi-asserted-by":"crossref","unstructured":"Cvejic, E., Kim, J., Davis, C., Gibert, G.: Prosody for the eyes: quantifying visual prosody using guided principal component analysis. In: Proceedings of INTERSPEECH. ISCA (2010)","DOI":"10.21437\/Interspeech.2010-434"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Dohen, M., L\u0153venbruck, H., Harold, H., et al.: Visual correlates of prosodic contrastive focus in french: description and inter-speaker variability. In: Speech Prosody, Dresden, Germany (2006)","DOI":"10.21437\/SpeechProsody.2006-210"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Eyben, F., W\u00f6llmer, M., Schuller, B.: Opensmile: the munich versatile and fast open-source audio feature extractor. In: Proceedings of International Conference on Multimedia, pp. 1459\u20131462. ACM (2010)","DOI":"10.1145\/1873951.1874246"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Graf, H., Cosatto, E., Strom, V., Huang, F.: Visual prosody: facial movements accompanying speech. In: International Conference on Automatic Face and Gesture Recognition, pp. 396\u2013401. IEEE (2002)","DOI":"10.1109\/AFGR.2002.1004186"},{"key":"2_CR17","doi-asserted-by":"crossref","unstructured":"Heckmann, M.: Audio-visual evaluation and detection of word prominence in a human-machine interaction scenario. In: Proceedings of INTERSPEECH. ISCA, Portland (2012)","DOI":"10.21437\/Interspeech.2012-626"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Heckmann, M.: Inter-speaker variability in audio-visual classification of word prominence. In: Proceedings of INTERSPEECH, Lyon, France (2013)","DOI":"10.21437\/Interspeech.2013-443"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"Heckmann, M., Domont, X., Joublin, F., Goerick, C.: A closer look on hierarchical spectro-temporal features (HIST). In: Proceedings of INTERSPEECH, Brisbane, Australia (2008)","DOI":"10.21437\/Interspeech.2008-208"},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"Heckmann, M., Gl\u00e4ser, C., Vaz, M., Rodemann, T., Joublin, F., Goerick, C.: Listen to the parrot: demonstrating the quality of online pitch and formant extraction via feature-based resynthesis. In: Proceedings IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), Nice (2008)","DOI":"10.1109\/IROS.2008.4650923"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"Heckmann, M., Joublin, F., Goerick, C.: Combining rate and place information for robust pitch extraction. In: Proceedings of INTERSPEECH, pp. 2765\u20132768, Antwerp (2007)","DOI":"10.21437\/Interspeech.2007-463"},{"issue":"1","key":"2_CR22","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1016\/S0095-4470(02)00071-2","volume":"31","author":"M Heldner","year":"2003","unstructured":"Heldner, M.: On the reliability of overall intensity and spectral emphasis as acoustic correlates of focal accents in swedish. J. Phonetics 31(1), 39\u201362 (2003)","journal-title":"J. Phonetics"},{"key":"2_CR23","unstructured":"Jeon, J., Wang, W., Liu, Y.: N-best rescoring based on pitch-accent patterns. In: Proceedings of 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, vol. 1, pp. 732\u2013741. Association for Computational Linguistics (2011)"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"Kristjansson, T., Deligne, S., Olsen, P.: Voicing features for robust speech detection. In: Proceedings of INTERSPEECH, vol. 2, p. 3 (2005)","DOI":"10.21437\/Interspeech.2005-186"},{"key":"2_CR25","unstructured":"Lee, A., Kawahara, T.: Recent development of open-source speech recognition engine julius. In: Proceedings of Asia-Pacific Signal and Information Processing Association Annual Summit and Conference, pp. 131\u2013137 (2009)"},{"key":"2_CR26","doi-asserted-by":"crossref","unstructured":"Levow, G.: Identifying local corrections in human-computer dialogue. In: Eighth International Conference on Spoken Language Processing (ICSLP) (2004)","DOI":"10.21437\/Interspeech.2004-146"},{"issue":"3","key":"2_CR27","doi-asserted-by":"publisher","first-page":"417","DOI":"10.1162\/coli.2006.32.3.417","volume":"32","author":"D Litman","year":"2006","unstructured":"Litman, D., Hirschberg, J., Swerts, M.: Characterizing and predicting corrections in spoken dialogue systems. Comput. Linguist. 32(3), 417\u2013438 (2006)","journal-title":"Comput. Linguist."},{"key":"2_CR28","unstructured":"Michele, G., Torreira, F., Boves, L.: Using FDA for investigating multidimensional dynamic phonetic contrasts. Preprint submitted to Journal of Phonetics (2013)"},{"issue":"2","key":"2_CR29","doi-asserted-by":"publisher","first-page":"133","DOI":"10.1111\/j.0963-7214.2004.01502010.x","volume":"15","author":"K Munhall","year":"2004","unstructured":"Munhall, K., Jones, J., Callan, D., Kuratate, T., Vatikiotis-Bateson, E.: Visual prosody and speech intelligibility. Psychol. Sci. 15(2), 133 (2004)","journal-title":"Psychol. Sci."},{"key":"2_CR30","doi-asserted-by":"crossref","unstructured":"Nakadai, K., Okuno, H., Nakajima, H., Hasegawa, Y., Tsujino, H.: An open source software system for robot audition hark and its evaluation. In: Proceedings of IEEE-RAS International Conference on Humanoid Robots (2008)","DOI":"10.1109\/ICHR.2008.4756031"},{"key":"2_CR31","unstructured":"Ramsay, J.: Functions for functional data analysis in R, SPLUS and Matlab. http:\/\/www.psych.mcgill.ca\/misc\/fda\/"},{"key":"2_CR32","doi-asserted-by":"crossref","DOI":"10.1007\/b98888","volume-title":"Functional Data Analysis","author":"J Ramsay","year":"2005","unstructured":"Ramsay, J., Silverman, B.: Functional Data Analysis. Springer, New York (2005)"},{"key":"2_CR33","unstructured":"Rosenberg, A.: Automatic detection and classification of prosodic events. Ph.D. thesis, Columbia University (2009)"},{"key":"2_CR34","doi-asserted-by":"crossref","unstructured":"Schnall, A., Heckmann, M.: Integrating sequence information in the audio-visual detection of word prominence in a human-machine interaction scenario. In: Proceedings of INTERSPEECH, Singapore (2014)","DOI":"10.21437\/Interspeech.2014-565"},{"issue":"2","key":"2_CR35","doi-asserted-by":"publisher","first-page":"165","DOI":"10.1109\/T-AFFC.2011.34","volume":"3","author":"M Schroder","year":"2012","unstructured":"Schroder, M., Bevacqua, E., Cowie, R., Eyben, F., Gunes, H., Heylen, D., Ter Maat, M., McKeown, G., Pammi, S., Pantic, M., Pelachaud, C., Schuller, B., de Sevin, E., Valstar, M., W\u00f6llmer, M.: Building autonomous sensitive artificial listeners. IEEE Trans. Affect. Comput. 3(2), 165\u2013183 (2012)","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"1","key":"2_CR36","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1016\/j.csl.2012.02.005","volume":"27","author":"B Schuller","year":"2013","unstructured":"Schuller, B., Steidl, S., Batliner, A., Burkhardt, F., Devillers, L., M\u00fcller, C., Narayanan, S.: Paralinguistics in speech and languag-state-of-the-art and the challenge. Comput. Speech Lang. 27(1), 4\u201339 (2013)","journal-title":"Comput. Speech Lang."},{"key":"2_CR37","doi-asserted-by":"crossref","unstructured":"Shriberg, E.: Spontaneous speech: How people really talk and why engineers should care. In: Proceedings of EUROSPEECH. ISCA (2005)","DOI":"10.21437\/Interspeech.2005-3"},{"issue":"3","key":"2_CR38","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1016\/j.specom.2005.02.018","volume":"46","author":"E Shriberg","year":"2005","unstructured":"Shriberg, E., Ferrer, L., Kajarekar, S., Venkataraman, A., Stolcke, A.: Modeling prosodic feature sequences for speaker recognition. Speech Commun. 46(3), 455\u2013472 (2005)","journal-title":"Speech Commun."},{"key":"2_CR39","doi-asserted-by":"crossref","unstructured":"Shriberg, E., Stolcke, A., Hakkani-T\u00fcr, D.Z., Heck, L.P.: Learning when to listen: detecting system-addressed speech in human-human-computer dialog. In: Proceedings of INTERSPEECH (2012)","DOI":"10.21437\/Interspeech.2012-83"},{"issue":"2","key":"2_CR40","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1016\/j.wocn.2007.05.001","volume":"36","author":"M Swerts","year":"2008","unstructured":"Swerts, M., Krahmer, E.: Facial expression and prosodic prominence: effects of modality and facial area. J. Phonetics 36(2), 219\u2013238 (2008)","journal-title":"J. Phonetics"},{"key":"2_CR41","doi-asserted-by":"crossref","unstructured":"Swerts, M., Litman, D., Hirschberg, J.: Corrections in spoken dialogue systems. In: Sixth International Conference on Spoken Language Processing (ICSLP). ISCA, Bejing (2000)","DOI":"10.21437\/ICSLP.2000-344"},{"issue":"2","key":"2_CR42","doi-asserted-by":"publisher","first-page":"690","DOI":"10.1109\/TASL.2006.881703","volume":"15","author":"D Wang","year":"2007","unstructured":"Wang, D., Narayanan, S.: An acoustic measure for word prominence in spontaneous speech. IEEE Trans. Audio Speech and Lang. Proc. 15(2), 690\u2013701 (2007)","journal-title":"IEEE Trans. Audio Speech and Lang. Proc."},{"key":"2_CR43","volume-title":"The HTK Book","author":"S Young","year":"1995","unstructured":"Young, S., Odell, J., Ollason, D., Valtchev, V., Woodland, P.: The HTK Book. Cambridge University, Cambridge (1995)"}],"container-title":["Lecture Notes in Computer Science","Multimodal Analyses enabling Artificial Agents in Human-Machine Interaction"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-15557-9_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,18]],"date-time":"2025-05-18T21:00:46Z","timestamp":1747602046000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-15557-9_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015]]},"ISBN":["9783319155562","9783319155579"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-15557-9_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2015]]},"assertion":[{"value":"12 February 2015","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}