{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,7,4]],"date-time":"2023-07-04T04:27:11Z","timestamp":1688444831845},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2013,8,17]],"date-time":"2013-08-17T00:00:00Z","timestamp":1376697600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2014,11]]},"DOI":"10.1007\/s11042-013-1609-3","type":"journal-article","created":{"date-parts":[[2013,8,16]],"date-time":"2013-08-16T08:12:54Z","timestamp":1376640774000},"page":"417-437","source":"Crossref","is-referenced-by-count":3,"title":["Speech-driven talking face using embedded confusable system for real time mobile multimedia"],"prefix":"10.1007","volume":"73","author":[{"given":"Po-Yi","family":"Shih","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Anand","family":"Paul","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jhing-Fa","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi-Hung","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2013,8,17]]},"reference":[{"key":"1609_CR1","doi-asserted-by":"crossref","unstructured":"Bregler C, Covell M, Slaney M (1997) Video rewrite: Driving visual speech with audio. In Proc. ACM SIGGRAPH\u201997","DOI":"10.1145\/258734.258880"},{"key":"1609_CR2","unstructured":"Cambridge University Engineering Dept. HTK Toolkit 3.4. http:\/\/htk.eng.cam.ac.uk\/"},{"issue":"1","key":"1609_CR3","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1109\/79.911195","volume":"18","author":"T Chen","year":"2001","unstructured":"Chen T (2001) Audiovisual speech processing: Lip reading and lip synchronization. IEEE Signal Process Mag 18(1):9\u201321","journal-title":"IEEE Signal Process Mag"},{"issue":"5","key":"1609_CR4","doi-asserted-by":"crossref","first-page":"837","DOI":"10.1109\/5.664274","volume":"86","author":"T Chen","year":"1998","unstructured":"Chen T, Rao RR (1998) Audio-visual integration in multimodal communication. Processing of the IEEE 86(5):837\u2013852","journal-title":"Processing of the IEEE"},{"issue":"1\u20132","key":"1609_CR5","doi-asserted-by":"crossref","first-page":"51","DOI":"10.1023\/A:1011171430700","volume":"29","author":"K Choi","year":"2001","unstructured":"Choi K, Luo Y, Hwang J-N (2001) Hidden Markov model inversion for audio-to-visual conversion in an MPEG-4 facial animation system. The Journal of VLSI Signal Processing 29(1\u20132):51\u201361","journal-title":"The Journal of VLSI Signal Processing"},{"key":"1609_CR6","doi-asserted-by":"crossref","first-page":"139","DOI":"10.1007\/978-4-431-66911-1_13","volume-title":"Models and techniques in computer animation","author":"MM Cohen","year":"1993","unstructured":"Cohen MM, Massaro DW (1993) Modeling coarticulation in synthetic visual speech. In: Magnenat-Thalmann M, Thalmann D (eds) Models and techniques in computer animation. Springer, Tokyo, pp 139\u2013156"},{"key":"1609_CR7","doi-asserted-by":"crossref","unstructured":"Cosatto E, Graf HP (1998) Sample-based synthesis of photo-realistic talking heads. in Proc. IEEE Computer Animation, pp. 103\u2013110","DOI":"10.1109\/CA.1998.681914"},{"key":"1609_CR8","doi-asserted-by":"crossref","first-page":"152","DOI":"10.1109\/6046.865480","volume":"2","author":"E Cosatto","year":"2000","unstructured":"Cosatto E, Graf HP (2000) Photo-realistic talking heads from image samples. IEEE Trans Multimedia 2:152\u2013163","journal-title":"IEEE Trans. Multimedia"},{"issue":"9","key":"1609_CR9","doi-asserted-by":"crossref","first-page":"1406","DOI":"10.1109\/JPROC.2003.817141","volume":"91","author":"E Cosatto","year":"2003","unstructured":"Cosatto E, Ostermann J, Graf HP, Schroeter J (2003) Lifelike talking faces for interactive services. Proc IEEE 91(9):1406\u20131428","journal-title":"Proc IEEE"},{"key":"1609_CR10","unstructured":"CrazyTalk, V 2.0 Lip-Sync, 2010. Http:\/\/www.reallusion.com\/Crazytalk\/ ."},{"key":"1609_CR11","unstructured":"Curinga S, Lavagetto F, Vignoli F (1996) Lip movements synthesis using time delay neural networks. in Proc. EUSIPCO 96\u2014Systems and Computers, pp. 36\u201346"},{"issue":"4","key":"1609_CR12","doi-asserted-by":"crossref","first-page":"251","DOI":"10.1109\/89.397090","volume":"3","author":"Y Ephraim","year":"1995","unstructured":"Ephraim Y, Trees HLV (1995) A signal subspace approach for speech enhancement. IEEE Transactions on Speech and Audio Processing 3(4):251\u2013266","journal-title":"IEEE Transactions on Speech and Audio Processing."},{"issue":"3","key":"1609_CR13","doi-asserted-by":"crossref","first-page":"388","DOI":"10.1145\/566654.566594","volume":"21","author":"T Ezzat","year":"2002","unstructured":"Ezzat T, Geiger G, Poggio T (2002) Trainable videorealistic speech animation. Proc ACM SIGGRAPH\u201902 21(3):388\u2013397","journal-title":"Proc ACM SIGGRAPH\u201902"},{"key":"1609_CR14","doi-asserted-by":"crossref","unstructured":"Imperl B, Horvat B (1999) The clustering algorithm for the definition of multilingual set of context dependent speech models. In Proceedings of the European Conference of Speech Communication and Technology, pp. 887\u2013890","DOI":"10.21437\/Eurospeech.1999-216"},{"key":"1609_CR15","doi-asserted-by":"crossref","unstructured":"Koster BE, Rodman RD, Bitzer D (1994) Automated lip-sync: Direct translation of speech-sound to mouth-shape. in Proc. 28th Annu. Asilomar Conf. Signals, pp. 583\u2013586","DOI":"10.1109\/ACSSC.1994.471519"},{"key":"1609_CR16","doi-asserted-by":"crossref","unstructured":"Lee S, Yook D (2002) Audio-to-visual conversion using hidden Markov models. In Proceedings of the 7th Pacific Rim International Conference on Artificial Intelligence, Springer-Verlag, pp. 563\u2013570","DOI":"10.1007\/3-540-45683-X_60"},{"key":"1609_CR17","volume-title":"Confusability of phonemes grouped according to their Viseme classes in noisy environments. Presented at tenth Australian international conference on speech science & technology","author":"P Lucey","year":"2004","unstructured":"Lucey P, Martin T, Sridharan S (2004) Confusability of phonemes grouped according to their Viseme classes in noisy environments. Presented at tenth Australian international conference on speech science & technology. Macquarie University, Sydney"},{"key":"1609_CR18","doi-asserted-by":"crossref","unstructured":"Mcallister DV, Rodman RD, Bitzer DL, Freeman AS (1997) Lip synchronization for Animation. Proc. SIGGRAPH 97, Los Angeles, CA, pp. 225\u2013228","DOI":"10.1145\/259081.259312"},{"key":"1609_CR19","unstructured":"Morishima S (1998) Real-time talking head driven by voice and its application to communication and entertainment. in Proc. AVSP 98, pp. 195\u2013200"},{"key":"1609_CR20","first-page":"826","volume":"3","author":"J Ostermann","year":"2004","unstructured":"Ostermann J, Weissenfeld A (2004) Talking faces-technologies and applications. In Proc of ICPR\u201904 3:826\u2013833","journal-title":"In Proc. of ICPR\u201904"},{"issue":"7","key":"1609_CR21","doi-asserted-by":"crossref","first-page":"1299","DOI":"10.1109\/TMM.2008.2004908","volume":"10","author":"J Park","year":"2008","unstructured":"Park J, Ko H (2008) Real-time continuous phoneme recognition system using class-dependent tied-mixture HMM with HBT structure for speech-driven lip-sync. IEEE Trans Multimedia 10(7):1299\u20131306","journal-title":"IEEE Trans. Multimedia"},{"key":"1609_CR22","doi-asserted-by":"crossref","unstructured":"Parke F, Waters K (1996) Computer facial animation","DOI":"10.1037\/e526112012-055"},{"key":"1609_CR23","unstructured":"Tamura M, Masuko T, Kobayashi T, Tokuday K (1998) Visual speech synthesis based on parameter generation from HMM: Speech driven and text-and-speech driven approaches. in Proc. Audio-Visual Speech Processing (AVSP 98), pp. 221\u2013226"},{"key":"1609_CR24","doi-asserted-by":"crossref","first-page":"127","DOI":"10.1016\/j.specom.2004.07.002","volume":"44","author":"B Theobald","year":"2004","unstructured":"Theobald B, Bangham A, Matthews I, Cawley G (2004) Near-videorealistic synthetic talking faces: Implementation and evaluation. Speech Communication 44:127\u2013140","journal-title":"Speech Communication"},{"key":"1609_CR25","volume-title":"A real-time speech-driven talking head using active appearance models. AVSP 2007, international conference on auditory-visual speech processing 2007","author":"BJ Theobald","year":"2007","unstructured":"Theobald BJ, Wilkinson N (2007) A real-time speech-driven talking head using active appearance models. AVSP 2007, international conference on auditory-visual speech processing 2007. Kasteel Groenendael, Hilvarenbeek"},{"key":"1609_CR26","doi-asserted-by":"crossref","unstructured":"Turunen E (2001) Survey of theory and applications of Lukasiewicz-Pavelka fuzzy logic. Lectures on Soft Computing and Fuzzy Logic. Advances in Soft Computing, pp. 313\u2013337","DOI":"10.1007\/978-3-7908-1818-5_17"},{"key":"1609_CR27","unstructured":"Wang HC (1997) MAT\u2014A project to collect Mandarin speech data through telephone networks. Computational Linguistics and Chinese Language Processing, Computational Linguistics Society of R.O.C., vol.2, no. 1, pp. 73\u201390."},{"issue":"1","key":"1609_CR28","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1109\/TASE.2007.911680","volume":"5","author":"J-C Wang","year":"2008","unstructured":"Wang J-C, Lee H-P, Wang J-F, Lin C-B (2008) Robust environmental sound recognition for home automation. IEEE Transaction on Automation Science and Engineering 5(1):25\u201331","journal-title":"IEEE Transaction on Automation Science and Engineering"},{"issue":"7","key":"1609_CR29","doi-asserted-by":"crossref","first-page":"1055","DOI":"10.1093\/ietisy\/e90-d.7.1055","volume":"E90-D","author":"J-C Wang","year":"2007","unstructured":"Wang J-C, Lee H-P, Wang J-F, Yang C-H (2007) Critical band subspace-based speech enhancement using SNR and auditory masking aware technique. IEICE Trans Inf Syst E90-D(7):1055\u20131062","journal-title":"IEICE Trans Inf Syst"},{"issue":"3","key":"1609_CR30","doi-asserted-by":"crossref","first-page":"500","DOI":"10.1109\/TMM.2006.888009","volume":"9","author":"L Xie","year":"2007","unstructured":"Xie L, Liu Z (2007) Realistic mouth-synching for speech-driven talking face using articulatory modeling. IEEE Trans Multimedia 9(3):500\u2013510","journal-title":"IEEE Trans. Multimedia"},{"issue":"1\u20132","key":"1609_CR31","doi-asserted-by":"crossref","first-page":"105","DOI":"10.1016\/S0167-6393(98)00054-5","volume":"26","author":"E Yamamoto","year":"1998","unstructured":"Yamamoto E, Nakamura S, Shikano K (1998) Lip movement synthesis from speech based on Hidden Markov models. Speech Communication 26(1\u20132):105\u2013115","journal-title":"Speech Communication"},{"key":"1609_CR32","unstructured":"Ye J, Yao H, Jiang F (2004) Based on HMM and SVM multilayer architecture classifier for chinese sign language recognition with large vocabulary. Proc. Third Int\u2019l Conf. Image and Graphics (ICIG\u201904), 377\u2013380"},{"key":"1609_CR33","doi-asserted-by":"crossref","unstructured":"Zgank A, Imperl B, Johansen F (2001) Crosslingual speech recognition with multilingual acoustic models based on agglomerative and tree-based triphone clustering. In Proceedings of the European Conference of Speech Communication and Technology, pp. 2725\u20132728","DOI":"10.21437\/Eurospeech.2001-637"},{"key":"1609_CR34","doi-asserted-by":"crossref","unstructured":"Zhong D, Def\u00e9e I (2007) Performance of similarity measures based on histograms of local image feature vectors. J Patt Recog Lett 28(15)","DOI":"10.1016\/j.patrec.2007.05.019"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-013-1609-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-013-1609-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-013-1609-3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,3]],"date-time":"2023-07-03T22:25:09Z","timestamp":1688423109000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-013-1609-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013,8,17]]},"references-count":34,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2014,11]]}},"alternative-id":["1609"],"URL":"https:\/\/doi.org\/10.1007\/s11042-013-1609-3","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2013,8,17]]}}}