{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:24:32Z","timestamp":1772907872785,"version":"3.50.1"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2013,6,27]],"date-time":"2013-06-27T00:00:00Z","timestamp":1372291200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/2.0"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["J AUDIO SPEECH MUSIC PROC."],"published-print":{"date-parts":[[2013,12]]},"DOI":"10.1186\/1687-4722-2013-16","type":"journal-article","created":{"date-parts":[[2013,6,27]],"date-time":"2013-06-27T16:41:45Z","timestamp":1372351305000},"source":"Crossref","is-referenced-by-count":4,"title":["Acoustic-visual synthesis technique using bimodal unit-selection"],"prefix":"10.1186","volume":"2013","author":[{"given":"Slim","family":"Ouni","sequence":"first","affiliation":[]},{"given":"Vincent","family":"Colotte","sequence":"additional","affiliation":[]},{"given":"Utpala","family":"Musti","sequence":"additional","affiliation":[]},{"given":"Asterios","family":"Toutios","sequence":"additional","affiliation":[]},{"given":"Brigitte","family":"Wrobel-Dautcourt","sequence":"additional","affiliation":[]},{"given":"Marie-Odile","family":"Berger","sequence":"additional","affiliation":[]},{"given":"Caroline","family":"Lavecchia","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2013,6,27]]},"reference":[{"key":"79_CR1","volume-title":"Proceedings of the 14th International Congress of Phonetic Sciences (ICPhS \u201899)","author":"J Barker","year":"1999","unstructured":"Barker J, Berthommier F: Evidence of correlation between acoustic and visual features of speech. In Proceedings of the 14th International Congress of Phonetic Sciences (ICPhS \u201899). San Francisco, CA: International Phonetic Association (IPA),; 1\u20137 August 1999."},{"issue":"1\u20132","key":"79_CR2","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1016\/S0167-6393(98)00048-X","volume":"26","author":"H Yehia","year":"1998","unstructured":"Yehia H, Rubin P, Vatikiotis-Bateson E: Quantitative association of vocal-tract and facial behavior. Speech Commun 1998, 26(1\u20132):23-43.","journal-title":"Speech Commun"},{"key":"79_CR3","doi-asserted-by":"publisher","first-page":"212","DOI":"10.1121\/1.1907309","volume":"26","author":"W Sumby","year":"1954","unstructured":"Sumby W, Pollack I: Visual contribution to speech intelligibility in noise. J. Acoust. Soc. Am 1954, 26: 212. 10.1121\/1.1907309","journal-title":"J. Acoust. Soc. Am"},{"key":"79_CR4","first-page":"53","volume-title":"2nd International Conference on Speech Synthesis","author":"Le Goff","year":"1994","unstructured":"Goff Le, Guiard-Marigny T, Cohen M, Benoit C: Real-time analysis-synthesis and intelligibility of talking faces. In 2nd International Conference on Speech Synthesis. Newark, NY: ISCA\/IEEE,; September 1994:53-56."},{"key":"79_CR5","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1155\/2007\/47891","volume":"2007","author":"S Ouni","year":"2007","unstructured":"Ouni S, Cohen MM, Ishak H, Massaro DW: Visual contribution to speech perception: measuring the intelligibility of animated talking heads. EURASIP J. Audio Speech Music Process 2007, 2007: 3-3. 10.1155\/2007\/47891","journal-title":"EURASIP J. Audio Speech Music Process"},{"issue":"4","key":"79_CR6","doi-asserted-by":"publisher","first-page":"331","DOI":"10.1023\/A:1025700715107","volume":"6","author":"G Bailly","year":"2003","unstructured":"Bailly G, B\u00e9rar M, Elisei F, Odisio M: Audiovisual speech synthesis. Int. J. Speech Technol 2003, 6(4):331-346. 10.1023\/A:1025700715107","journal-title":"Int. J. Speech Technol"},{"key":"79_CR7","volume-title":"Proceedings of the International Congress on Phonetic Sciences","author":"BJ Theobald","year":"2007","unstructured":"Theobald BJ: Audiovisual speech synthesis. In Proceedings of the International Congress on Phonetic Sciences. Saarbrucken: International Phonetic Association (IPA),; 6\u201310 August 2007."},{"key":"79_CR8","doi-asserted-by":"publisher","first-page":"174192","DOI":"10.1155\/2009\/174192","volume":"2009","author":"K Liu","year":"2009","unstructured":"Liu K, Ostermann J: Optimization of an image-based talking head system. EURASIP J. Audio Speech Music Process 2009, 2009: 174192. 10.1155\/2009\/174192","journal-title":"EURASIP J. Audio Speech Music Process"},{"key":"79_CR9","doi-asserted-by":"publisher","first-page":"597267","DOI":"10.1155\/2009\/597267","volume":"2009","author":"JD Edge","year":"2009","unstructured":"Edge JD, Hilton A, Jackson P: Model-based synthesis of visual speech movements from 3D video. EURASIP J. Audio Speech Music Process 2009, 2009: 597267. 10.1155\/2009\/597267","journal-title":"EURASIP J. Audio Speech Music Process"},{"key":"79_CR10","doi-asserted-by":"publisher","first-page":"719","DOI":"10.1068\/p090719","volume":"9","author":"NF Dixon","year":"1980","unstructured":"Dixon NF, Spitz L: The detection of audiovisual desynchrony. Perception 1980, 9: 719-721. 10.1068\/p090719","journal-title":"Perception"},{"key":"79_CR11","doi-asserted-by":"publisher","first-page":"34","DOI":"10.3758\/BF03208030","volume":"45","author":"KP Green","year":"1989","unstructured":"Green KP, Kuhl PK: The role of visual information in the processing of place and manner features in speech perception. Percept. Psychophys 1989, 45: 34-42. 10.3758\/BF03208030","journal-title":"Percept. Psychophys"},{"key":"79_CR12","first-page":"278","volume":"17","author":"KP Green","year":"1991","unstructured":"Green KP, Kuhl PK: Integral processing of visual place and auditory voicing information during phonetic perception. J. Exp. Psychol.: Hum. Percept. Perform 1991, 17: 278-288.","journal-title":"J. Exp. Psychol.: Hum. Percept. Perform"},{"key":"79_CR13","doi-asserted-by":"publisher","first-page":"1174","DOI":"10.1155\/S1110865702206046","volume":"11","author":"J Jiang","year":"2002","unstructured":"Jiang J, Alwan A, Keating PA, Auer ET, Bernstein LE: On the importance of audiovisual coherence for the perceived quality of synthesized visual speech. EURASIP J. Appl. Signal Process 2002, 11: 1174-1188.","journal-title":"EURASIP J. Appl. Signal Process"},{"key":"79_CR14","volume-title":"Proceedings of AVSP","author":"J Jiang","year":"2005","unstructured":"Jiang J, Bernstein LE, Edward T, Auer J: Realistic face animation from sparse stereo meshes. In Proceedings of AVSP. British Columbia: AVISA,; 24\u201327 July 2005."},{"key":"79_CR15","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1038\/264746a0","volume":"264","author":"H McGurk","year":"1976","unstructured":"McGurk H, MacDonald J: Hearing lips and seeing voices. Nature 1976, 264: 746-748. 10.1038\/264746a0","journal-title":"Nature"},{"key":"79_CR16","doi-asserted-by":"publisher","first-page":"169819","DOI":"10.1155\/2009\/169819","volume":"2009","author":"W Mattheyses","year":"2009","unstructured":"Mattheyses W, Latacz L, Verhelst W: On the importance of audiovisual coherence for the perceived quality of synthesized visual speech. EURASIP J. Audio Speech Music Process 2009, 2009: 169819. 10.1155\/2009\/169819","journal-title":"EURASIP J. Audio Speech Music Process"},{"key":"79_CR17","volume-title":"Proceedings of ICASSP","author":"A Hunt","year":"1996","unstructured":"Hunt A, Black A: Unit selection in a concatenative speech synthesis system using a large speech database. In Proceedings of ICASSP. Atlanta: IEEE,; 7\u201310 May 1996."},{"key":"79_CR18","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511816338","volume-title":"in Text-to-Speech Synthesis","author":"P Taylor","year":"2009","unstructured":"Taylor P: in Text-to-Speech Synthesis. Cambridge: Cambridge Univ. Press; 2009."},{"key":"79_CR19","volume-title":"Proceedings of the AVSP","author":"A Hallgren","year":"1998","unstructured":"Hallgren A, Lyberg B: Visual speech synthesis with concatenative speech. In Proceedings of the AVSP. Terrigal-Sydney: AVISA,; 4\u20136 December 1998."},{"key":"79_CR20","first-page":"959","volume-title":"Proceedings of the Eurospeech Conference","author":"M Tamura","year":"1999","unstructured":"Tamura M, Kondo S, Masuko T, Kobayashi T: Text-to-audio-visual speech synthesis based on parameter generation from HMM. In Proceedings of the Eurospeech Conference. Budapest: ; 5\u20139 September 1999:959-962."},{"key":"79_CR21","volume-title":"Proceedings of the Interspeech 2000","author":"S Minnis","year":"2000","unstructured":"Minnis S, Breen A: Modeling visual coarticulation in synthetic talking heads using a lip motion unit inventory with concatenative synthesis. In Proceedings of the Interspeech 2000. Beijing: ISCA,; 16\u201320 October 2000."},{"key":"79_CR22","volume-title":"Proceedings of the International Conference on Speech and Computer","author":"S Fagel","year":"2006","unstructured":"Fagel S: Joint audio-visual units selection - the JAVUS speech synthesizer. In Proceedings of the International Conference on Speech and Computer. St. Petersburg: SPIIRAS,; June 2006."},{"key":"79_CR23","volume-title":"Interspeech 2010","author":"A Toutios","year":"2010","unstructured":"Toutios A, Musti U, Ouni S, Colotte V, Wrobel-Dautcourt B, Berger MO: Setup for acoustic-visual speech synthesis by concatenating bimodal units. In Interspeech 2010. Makuhari, Japan: Visac Publications,; 2010."},{"key":"79_CR24","volume-title":"Proceedings of the AVSP","author":"B Wrobel-Dautcourt","year":"2005","unstructured":"Wrobel-Dautcourt B, Berger M, Potard B, Laprie Y, Ouni S: A low-cost stereovision based system for acquisition of visible articulatory data. In Proceedings of the AVSP. British Columbia: AVISA,; 2005."},{"key":"79_CR25","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1007\/978-94-009-2037-8_6","volume-title":"Speech Production and Speech Modelling","author":"S Maeda","year":"1990","unstructured":"Maeda S: Compensatory articulation during speech: evidence from the analysis and synthesis of vocal-tract shapes using an articulatory model. In Speech Production and Speech Modelling. Edited by: Hardcastle WJ, Marchal A. Dordrecht: Kluwer Academic,; 1990:131-149."},{"issue":"4","key":"79_CR26","doi-asserted-by":"publisher","first-page":"317","DOI":"10.1016\/j.specom.2007.01.014","volume":"49","author":"R Clark","year":"2007","unstructured":"Clark R, Richmond K, King S: Multisyn: Open-domain unit selection for the festival speech synthesis system. Speech Commun 2007, 49(4):317-330. 10.1016\/j.specom.2007.01.014","journal-title":"Speech Commun"},{"key":"79_CR27","unstructured":"Colotte V, Lafosse A: Soja: french text-to-speech synthesis system. . Accessed 21 June 2013 \n                    http:\/\/soja-tts.loria.fr\/"},{"key":"79_CR28","volume-title":"Interspeech Proceedings","author":"V Colotte","year":"2005","unstructured":"Colotte V, Beaufort R: Linguistic features weighting for a text-to-speech system without prosody model. In Interspeech Proceedings. Lisbon: ISCA,; 4\u20138 September 2005."},{"key":"79_CR29","volume-title":"International Conference on Auditory-Visual Speech Processing - AVSP2011","author":"U Musti","year":"2011","unstructured":"Musti U, Colotte V, Toutios A, Ouni S: Introducing visual target cost within an acoustic-visual unit-selection speech synthesizer. In International Conference on Auditory-Visual Speech Processing - AVSP2011. Volterra: ; 31 August to 3 September 2011."},{"key":"79_CR30","volume-title":"5th Conference on Auditory-Visual Speech Processing - AVSP 2005","author":"V Robert","year":"2005","unstructured":"Robert V, Wrobel-Dautcourt B, Laprie Y, Bonneau A: Inter speaker variability of labial coarticulation with the view of developing a formal coarticulation model for French. In 5th Conference on Auditory-Visual Speech Processing - AVSP 2005. Vancouver Island: AVISA,; 24\u201327 July 2005."},{"key":"79_CR31","first-page":"27","volume-title":"12th Annual Conference of the International Speech Communication Association - Interspeech","author":"A Toutios","year":"2011","unstructured":"Toutios A, Musti U, Ouni S, Colotte V: Weight optimization for bimodal unit-selection talking head synthesis. In 12th Annual Conference of the International Speech Communication Association - Interspeech 2011. Edited by: ISCA, ISCA . Florence: ISCA,; 27\u201331 August 2011."},{"issue":"5\u20136","key":"79_CR32","doi-asserted-by":"publisher","first-page":"453","DOI":"10.1016\/0167-6393(90)90021-Z","volume":"9","author":"E Moulines","year":"1990","unstructured":"Moulines E, Charpentier F: Pitch-synchronous waveform processing techniques for text-to-speech synthesis using diphones. Speech Commun 1990, 9(5\u20136):453-467.","journal-title":"Speech Commun"},{"key":"79_CR33","volume-title":"XI European Signal Processing Conference - EUSIPCO 2002 (2002)","author":"V Colotte","year":"2002","unstructured":"Colotte V, Laprie Y: Higher precision pitch marking for TD-PSOLA. In XI European Signal Processing Conference - EUSIPCO 2002 (2002). Toulouse: EURASIP,; 3\u20136 September 2002."},{"key":"79_CR34","doi-asserted-by":"crossref","first-page":"552","DOI":"10.1007\/978-3-642-04380-2_87","volume-title":"IVA \u201909 Proceedings of the 9th International Conference on Intelligent Virtual Agents","author":"B Weiss","year":"2009","unstructured":"Weiss B, K\u00fchnel C, Wechsung I, M\u00f6ller S, Fagel S: Web-Based Evaluation of talking heads: how valid is it? In IVA \u201909 Proceedings of the 9th International Conference on Intelligent Virtual Agents. Amsterdam: Springer,; 14\u201316 September 2009:552-553."},{"key":"79_CR35","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1016\/j.specom.2004.11.008","volume":"45","author":"S Ouni","year":"2005","unstructured":"Ouni S, Cohen MM, Massaro DW: Training Baldi to be multilingual: a case study for an Arabic Badr. Speech Commun 2005, 45: 115-137. 10.1016\/j.specom.2004.11.008","journal-title":"Speech Commun"},{"key":"79_CR36","first-page":"33","volume":"7","author":"M Mori","year":"1970","unstructured":"Mori M: The uncanny valley. Energy 1970, 7: 33-35.","journal-title":"Energy"}],"container-title":["EURASIP Journal on Audio, Speech, and Music Processing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1186\/1687-4722-2013-16\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1186\/1687-4722-2013-16.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1186\/1687-4722-2013-16.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,1,21]],"date-time":"2019-01-21T20:59:00Z","timestamp":1548104340000},"score":1,"resource":{"primary":{"URL":"https:\/\/asmp-eurasipjournals.springeropen.com\/articles\/10.1186\/1687-4722-2013-16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013,6,27]]},"references-count":36,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2013,12]]}},"alternative-id":["79"],"URL":"https:\/\/doi.org\/10.1186\/1687-4722-2013-16","relation":{},"ISSN":["1687-4722"],"issn-type":[{"value":"1687-4722","type":"electronic"}],"subject":[],"published":{"date-parts":[[2013,6,27]]},"article-number":"16"}}