{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T16:48:26Z","timestamp":1759942106526},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"22","license":[{"start":{"date-parts":[[2014,8,3]],"date-time":"2014-08-03T00:00:00Z","timestamp":1407024000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2015,11]]},"DOI":"10.1007\/s11042-014-2118-8","type":"journal-article","created":{"date-parts":[[2014,8,2]],"date-time":"2014-08-02T08:11:39Z","timestamp":1406967099000},"page":"9849-9869","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["HMM trajectory-guided sample selection for photo-realistic talking head"],"prefix":"10.1007","volume":"74","author":[{"given":"Lijuan","family":"Wang","sequence":"first","affiliation":[]},{"given":"Frank K.","family":"Soong","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2014,8,3]]},"reference":[{"key":"2118_CR1","first-page":"187","volume":"99","author":"V Blanz","year":"1999","unstructured":"Blanz V, Vetter T (1999) A morphable model for the synthesis Of 3D faces. Proc ACM SIGGRAPH 99:187\u2013194","journal-title":"Proc ACM SIGGRAPH"},{"key":"2118_CR2","first-page":"353","volume":"97","author":"C Bregler","year":"1997","unstructured":"Bregler C, Covell M, Slaney M (1997) Video rewrite: driving visual speech with audio. Proc ACM SIGGRAPH 97:353\u2013360","journal-title":"Proc ACM SIGGRAPH"},{"issue":"1","key":"2118_CR3","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1109\/79.911195","volume":"18","author":"T Chen","year":"2001","unstructured":"Chen T (2001) Audiovisual speech processing. Signal Proc Mag 18(1):9\u201321","journal-title":"Signal Proc Mag"},{"issue":"3","key":"2118_CR4","doi-asserted-by":"crossref","first-page":"152","DOI":"10.1109\/6046.865480","volume":"2","author":"E Cosatto","year":"2000","unstructured":"Cosatto E, Graf HP (2000) Photo-realistic talking heads from image samples. IEEE Trans Multimed 2(3):152\u2013163","journal-title":"IEEE Trans Multimed"},{"key":"2118_CR5","unstructured":"Donovan RE, Eide EM (1998) The IBM trainable speech synthesis system. Proc 1998 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 98) IEEE, pp 1703\u20131706"},{"key":"2118_CR6","doi-asserted-by":"crossref","unstructured":"Ezzat T, Geiger G, PoggioT (2002) Trainable video realistic speech animation. Proc ACM SIGGRAPH 2002, pp 388\u2013398","DOI":"10.1145\/566654.566594"},{"key":"2118_CR7","doi-asserted-by":"crossref","unstructured":"Ezzat T, Poggio T (1998) Miketalk: a talking facial display based on morphing visemes. Proc Comput Animat, pp 96\u2013102","DOI":"10.1109\/CA.1998.681913"},{"key":"2118_CR8","unstructured":"Hirai T, Tenpaku S (2004) Using 5ms segments in concatenative speech synthesis. Proc 5th ISCA Speech Synt Work Int\u2019l Speech Comm Assoc pp 37\u201342"},{"key":"2118_CR9","doi-asserted-by":"crossref","unstructured":"Huang F, Cosatto E, Graf HP (2002) Triphone based unit selection for concatenative visual speech synthesis. Proc 2002 I.E. Int\u2019l Conf Acoust Speech Signal Proc (ICASSP 02) IEEE, pp 2037\u20132040","DOI":"10.1109\/ICASSP.2002.5745033"},{"key":"2118_CR10","doi-asserted-by":"crossref","unstructured":"Huang X et al (1997) Recent improvements on microsoft\u2019s trainable text-to-speech system \u2013 Whistler. Proc 1997 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 97) IEEE, pp 959\u2013962","DOI":"10.1109\/ICASSP.1997.596097"},{"key":"2118_CR11","doi-asserted-by":"crossref","unstructured":"Hunt A, Black A (1996) Unit selection in a concatenative speech synthesis system using a large speech database. Proc 1996 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 96) IEEE, pp 373\u2013376","DOI":"10.1109\/ICASSP.1996.541110"},{"issue":"3","key":"2118_CR12","doi-asserted-by":"crossref","first-page":"341","DOI":"10.1109\/TVCG.2005.43","volume":"11","author":"SA King","year":"2005","unstructured":"King SA, Parent RE (2005) Creating speech-synchronized animation. IEEE Trans Vis Comput Graph 11(3):341\u2013352","journal-title":"IEEE Trans Vis Comput Graph"},{"key":"2118_CR13","unstructured":"Lewis JP Fast normalized cross-correlation. Industrial Light & Magic"},{"key":"2118_CR14","unstructured":"Ling ZH, Wang RH (2006) HMM-based unit selection using frame sized speech segments. Proc 7th Ann Conf Int\u2019l Speech Comm Assoc. (Interspeech 06) Int\u2019l Speech Comm Assoc, pp 2034\u20132037"},{"key":"2118_CR15","doi-asserted-by":"crossref","unstructured":"Liu K, Ostermann J (2008) Realistic facial animation system for interactive services. Proc 9th Ann Conf Int\u2019l Speech Comm Assoc (Interspeech 08), Int\u2019l Speech Comm Assoc, pp 2330\u20132333","DOI":"10.21437\/Interspeech.2008-594"},{"key":"2118_CR16","unstructured":"Liu K, Weissenfeld A, Ostermann J (2006) Parameterization of mouth images by LLE and PCA for image-based facial animation. Proc 2006 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 06) IEEE, pp 461\u2013464"},{"key":"2118_CR17","doi-asserted-by":"crossref","unstructured":"Mattheyses W et al (2008) Multimodal unit selection for 2D audiovisual text-to-speech synthesis. Lect Note Comput Sci, pp 125\u2013136","DOI":"10.1007\/978-3-540-85853-9_12"},{"issue":"4","key":"2118_CR18","doi-asserted-by":"crossref","first-page":"854","DOI":"10.1109\/TNN.2002.1021886","volume":"13","author":"S Nakamura","year":"2002","unstructured":"Nakamura S (2002) Statistical multimodal integration for audio-visual speech processing. IEEE Trans Neural Netw 13(4):854\u2013866","journal-title":"IEEE Trans Neural Netw"},{"key":"2118_CR19","doi-asserted-by":"crossref","unstructured":"Perez P, Gangnet M, Blake A (2003) Poisson image editing. ACM Trans Graph (SIGGRAPH\u201903) 22(3):313\u2013318","DOI":"10.1145\/882262.882269"},{"key":"2118_CR20","first-page":"75","volume":"98","author":"F Pighin","year":"1998","unstructured":"Pighin F et al (1998) Synthesizing realistic facial expressions from photographs. Proc ACM SIGGRAPH 98:75\u201384","journal-title":"Proc ACM SIGGRAPH"},{"key":"2118_CR21","doi-asserted-by":"crossref","unstructured":"Sako S et al (2000) HMM-based text-to-audio-visual speech synthesis. Proc 6th Int\u2019l Conf on Spoken Lang Process (ICSLP 00) Int\u2019l Speech Comm Assoc, pp 25\u201328","DOI":"10.21437\/ICSLP.2000-469"},{"key":"2118_CR22","doi-asserted-by":"crossref","unstructured":"Scott MR, Liu X, Zhou M (2011) Towards a specialized search engine for language learners. Proc IEEE, pp 1462\u20131465","DOI":"10.1109\/JPROC.2011.2160107"},{"key":"2118_CR23","doi-asserted-by":"crossref","first-page":"127","DOI":"10.1016\/j.specom.2004.07.002","volume":"44","author":"BJ Theobald","year":"2004","unstructured":"Theobald BJ et al (2004) Near videorealistic synthetic talking faces: implementation and evaluation. Speech Comm 44:127\u2013140","journal-title":"Speech Comm"},{"key":"2118_CR24","doi-asserted-by":"crossref","unstructured":"Theobald B et al (2008) LIPS2008: visual speech synthesis challenge. Proc 9th Ann Conf Int\u2019l Speech Comm Assoc (Interspeech 08) Int\u2019l Speech Comm Assoc, pp 2310\u20132313","DOI":"10.21437\/Interspeech.2008-590"},{"key":"2118_CR25","doi-asserted-by":"crossref","unstructured":"Toda T, Black A, Tokuda K Spectral conversion based on maximum likelihood estimation considering global variance of converted parameter. Proc 2005 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 05) IEEE, pp 9\u201312","DOI":"10.1109\/ICASSP.2005.1415037"},{"key":"2118_CR26","unstructured":"Tokuda K et al (1996) Speech synthesis using HMMs with dynamic features Proc 1996 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 96) IEEE, pp 389\u2013392"},{"key":"2118_CR27","unstructured":"Video demonstration of our synthesis results: http:\/\/research.microsoft.com\/en-us\/projects\/photo-real_talking_head\/"},{"key":"2118_CR28","unstructured":"Wang JQ et al (2004) A real-time cantonese text-to-audiovisual speech synthesizer. Proc 2004 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 04) IEEE, pp I\u2013653\u2013I\u2013656"},{"issue":"12","key":"2118_CR29","doi-asserted-by":"crossref","first-page":"1533","DOI":"10.1109\/TCSVT.2006.885727","volume":"16","author":"Q Wang","year":"2006","unstructured":"Wang Q et al (2006) Real-time Bayesian 3-D pose tracking. IEEE Trans Circ Syst Video Technol 16(12):1533\u20131541","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"2118_CR30","doi-asserted-by":"crossref","unstructured":"Wang LJ et al (2010) Synthesizing photo-real talking head via trajectory-guided sample selection. Proc 11th Ann Conf Int\u2019l Speech Comm Assoc (Interspeech 10) Int\u2019l Speech Comm Assoc, pp 446\u2013449","DOI":"10.21437\/Interspeech.2010-194"},{"key":"2118_CR31","doi-asserted-by":"crossref","unstructured":"Wang LJ et al (2011) Synthesizing visual speech trajectory with minimum generation error. Proc 2011 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 11) IEEE, pp 4580\u20134583","DOI":"10.1109\/ICASSP.2011.5947374"},{"issue":"6","key":"2118_CR32","doi-asserted-by":"crossref","first-page":"38","DOI":"10.1109\/MC.2012.152","volume":"45","author":"LJ Wang","year":"2012","unstructured":"Wang LJ et al (2012) Computer-assisted audiovisual language learning. Computer 45(6):38\u201347, Computer Society","journal-title":"Computer"},{"key":"2118_CR33","unstructured":"Wu Y-J, Qin L, Tokuda K (2009) An improved minimum generation error based model adaptation for HMM-based speech synthesis. Proc 10th Ann Conf Int\u2019l Speech Comm Assoc (Interspeech 09) Int\u2019l Speech Comm Assoc, pp 1787\u20131790"},{"key":"2118_CR34","first-page":"89","volume":"I","author":"Y-J Wu","year":"2006","unstructured":"Wu Y-J, Wang R-H (2006) Minimum generation error training for HMM-based speech synthesis. Proc 2006 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 06) IEEE I:89\u201392","journal-title":"Proc 2006 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 06) IEEE"},{"key":"2118_CR35","doi-asserted-by":"crossref","unstructured":"Wu KK et al (2011) A sparse and low-rank approach to efficient face alignment for photo-real talking head synthesis. Proc 2011 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 11) IEEE, pp 1397\u20131400","DOI":"10.1109\/ICASSP.2011.5946674"},{"key":"2118_CR36","unstructured":"Xie L, Liu ZQ (2006) Speech animation using coupled hidden Markov models. Proc 2006 Int\u2019l Conf Pattern Recognit (ICPR\u201906), pp 1128\u20131131"},{"issue":"3","key":"2118_CR37","doi-asserted-by":"crossref","first-page":"500","DOI":"10.1109\/TMM.2006.888009","volume":"9","author":"L Xie","year":"2007","unstructured":"Xie L, Liu ZQ (2007) Realistic mouth-synching for speech-driven talking face using articulatory modelling. IEEE Trans Multimed 9(3):500\u2013510","journal-title":"IEEE Trans Multimed"},{"key":"2118_CR38","doi-asserted-by":"crossref","unstructured":"Yan ZJ, Qian Y, Soong F (2010) Rich-context Unit Selection (RUS) approach to high quality TTS. Proc 2010 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 10) IEEE, pp 4798\u20134801","DOI":"10.1109\/ICASSP.2010.5495150"},{"key":"2118_CR39","doi-asserted-by":"crossref","unstructured":"Zhang S et al (2007) Head movement synthesis based on semantic and prosodic features for a Chinese expressive avatar. Proc 2007 I.E. Int\u2019l Conf Acoust Speech Signal Process (ICASSP 07) IEEE, pp IV\u2013837\u2013IV\u2013840","DOI":"10.1109\/ICASSP.2007.367043"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-014-2118-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-014-2118-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-014-2118-8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,15]],"date-time":"2023-07-15T18:09:27Z","timestamp":1689444567000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-014-2118-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014,8,3]]},"references-count":39,"journal-issue":{"issue":"22","published-print":{"date-parts":[[2015,11]]}},"alternative-id":["2118"],"URL":"https:\/\/doi.org\/10.1007\/s11042-014-2118-8","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014,8,3]]}}}