{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T02:08:51Z","timestamp":1774058931440,"version":"3.50.1"},"publisher-location":"Cham","reference-count":25,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319937632","type":"print"},{"value":"9783319937649","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-93764-9_35","type":"book-chapter","created":{"date-parts":[[2018,6,5]],"date-time":"2018-06-05T07:09:12Z","timestamp":1528182552000},"page":"372-381","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":46,"title":["Generating Talking Face Landmarks from Speech"],"prefix":"10.1007","author":[{"given":"Sefik Emre","family":"Eskimez","sequence":"first","affiliation":[]},{"given":"Ross K.","family":"Maddox","sequence":"additional","affiliation":[]},{"given":"Chenliang","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Zhiyao","family":"Duan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,6,6]]},"reference":[{"issue":"4","key":"35_CR1","doi-asserted-by":"publisher","first-page":"342","DOI":"10.1177\/000348949210100410","volume":"101","author":"PJ Blamey","year":"1992","unstructured":"Blamey, P.J., Pyman, B.C., Clark, G.M., Dowell, R.C., Gordon, M., Brown, A.M., Hollow, R.D.: Factors predicting postoperative sentence scores in postlinguistically deaf adult cochlear implant patients. Ann. Otol. Rhinol. Laryngol. 101(4), 342\u2013348 (1992)","journal-title":"Ann. Otol. Rhinol. Laryngol."},{"key":"35_CR2","doi-asserted-by":"crossref","unstructured":"Brand, M.: Voice puppetry. In: Proceedings of the 26th Annual Conference on Computer Graphics and Interactive Techniques, pp. 21\u201328. ACM Press\/Addison-Wesley Publishing Co. (1999)","DOI":"10.1145\/311535.311537"},{"key":"35_CR3","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1016\/j.cviu.2015.08.011","volume":"148","author":"S Cassidy","year":"2016","unstructured":"Cassidy, S., Stenger, B., Dongen, L.V., Yanagisawa, K., Anderson, R., Wan, V., Baron-Cohen, S., Cipolla, R.: Expressive visual text-to-speech as an assistive technology for individuals with autism spectrum conditions. Comput. Vis. Image Underst. 148, 193\u2013200 (2016)","journal-title":"Comput. Vis. Image Underst."},{"key":"35_CR4","doi-asserted-by":"publisher","first-page":"51","DOI":"10.1023\/A:1011171430700","volume":"29","author":"K Choi","year":"2001","unstructured":"Choi, K., Luo, Y., Hwang, J.N.: Hidden Markov model inversion for audio-to-visual conversion in an MPEG-4 facial animation system. J. VLSI Signal Process. Syst. Signal Image Video Technol. 29, 51\u201361 (2001)","journal-title":"J. VLSI Signal Process. Syst. Signal Image Video Technol."},{"key":"35_CR5","unstructured":"Chung, J.S., Jamaludin, A., Zisserman, A.: You said that? (2017). 
arXiv preprint: arXiv:1705.02966"},{"issue":"5","key":"35_CR6","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S., Shao, X.: An audio-visual corpus for speech perception and automatic speech recognition. J. Acoust. Soc. Am. 120(5), 2421\u20132424 (2006)","journal-title":"J. Acoust. Soc. Am."},{"key":"35_CR7","doi-asserted-by":"crossref","unstructured":"Cosker, D., Marshall, D., Rosin, P.L., Hicks, Y.: Speech driven facial animation using a Hidden Markov coarticulation model. In: Proceedings of the 17th International Conference on Pattern Recognition (ICPR), vol. 1, pp. 128\u2013131. IEEE (2004)","DOI":"10.1109\/ICPR.2004.1334024"},{"key":"35_CR8","unstructured":"Cosker, D., Marshall, D., Rosin, P., Hicks, Y.: Video realistic talking heads using hierarchical non-linear speech-appearance models, Mirage, France, vol. 147 (2003)"},{"key":"35_CR9","volume-title":"Hearing by Eye: The Psychology of Lip-Reading","author":"BE Dodd","year":"1987","unstructured":"Dodd, B.E., Campbell, R.E.: Hearing by Eye: The Psychology of Lip-Reading. Lawrence Erlbaum Associates, Inc., Hillsdale (1987)"},{"key":"35_CR10","doi-asserted-by":"crossref","unstructured":"Fan, B., Wang, L., Soong, F.K., Xie, L.: Photo-real talking head with deep bidirectional LSTM. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4884\u20134888. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178899"},{"key":"35_CR11","doi-asserted-by":"crossref","unstructured":"Garofalo, J.S., Lamel, L.F., Fisher, W.M., Fiscus, J.G., Pallett, D.S., Dahlgren, N.L.: The darpa timit acoustic-phonetic continuous speech corpus CD-ROM. Linguistic Data Consortium (1993)","DOI":"10.6028\/NIST.IR.4930"},{"issue":"8","key":"35_CR12","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"35_CR13","first-page":"1755","volume":"10","author":"DE King","year":"2009","unstructured":"King, D.E.: Dlib-ml: a machine learning toolkit. J. Mach. Learn. Res. 10, 1755\u20131758 (2009)","journal-title":"J. Mach. Learn. Res."},{"key":"35_CR14","doi-asserted-by":"crossref","unstructured":"Maddox, R.K., Atilgan, H., Bizley, J.K., Lee, A.K.: Auditory selective attention is enhanced by a task-irrelevant temporally coherent visual stimulus in human listeners. eLife 4 (2015)","DOI":"10.7554\/eLife.04995"},{"key":"35_CR15","unstructured":"Mallick, S.: Face morph using opencv c++\/python (2016). http:\/\/www.learnopencv.com\/face-morph-using-opencv-cpp-python\/"},{"key":"35_CR16","doi-asserted-by":"crossref","unstructured":"Pham, H.X., Cheung, S., Pavlovic, V.: Speech-driven 3d facial animation with implicit emotional awareness: a deep learning approach. In: The 1st DALCOM Workshop, CVPR (2017)","DOI":"10.1109\/CVPRW.2017.287"},{"key":"35_CR17","unstructured":"Pham, H.X., Wang, Y., Pavlovic, V.: End-to-end learning for 3d facial animation from raw waveforms of speech (2017). arXiv preprint: arXiv:1710.00920"},{"key":"35_CR18","unstructured":"Richie, S., Warburton, C., Carter, M.: Audiovisual database of spoken American English. 
Linguistic Data Consortium (2009)"},{"issue":"4","key":"35_CR19","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. (TOG) 36(4), 95 (2017)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"35_CR20","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/978-3-540-88190-2_9","volume-title":"Advances in Artificial Intelligence - SBIA 2008","author":"LD Terissi","year":"2008","unstructured":"Terissi, L.D., G\u00f3mez, J.C.: Audio-to-visual conversion via HMM inversion for speech-driven facial animation. In: Zaverucha, G., da Costa, A.L. (eds.) SBIA 2008. LNCS (LNAI), vol. 5249, pp. 33\u201342. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-88190-2_9"},{"key":"35_CR21","doi-asserted-by":"crossref","unstructured":"Tillman, T.W., Carhart, R.: An expanded test for speech discrimination utilizing CNC monosyllabic words: Northwestern University auditory test no. 6. Technical report, Northwestern University Evanston Auditory Research Lab (1966)","DOI":"10.21236\/AD0639638"},{"key":"35_CR22","doi-asserted-by":"crossref","unstructured":"Wan, V., Anderson, R., Blokland, A., Braunschweiler, N., Chen, L., Kolluru, B., Latorre, J., Maia, R., Stenger, B., Yanagisawa, K., et al.: Photo-realistic expressive text to talking head synthesis. In: INTERSPEECH, pp. 2667\u20132669 (2013)","DOI":"10.1145\/2503385.2503473"},{"key":"35_CR23","unstructured":"Wang, L., Han, W., Soong, F.K., Huo, Q.: Text driven 3d photo-realistic talking head. In: Twelfth Annual Conference of the International Speech Communication Association (2011)"},{"key":"35_CR24","doi-asserted-by":"publisher","first-page":"2325","DOI":"10.1016\/j.patcog.2006.12.001","volume":"40","author":"L Xie","year":"2007","unstructured":"Xie, L., Liu, Z.Q.: A coupled HMM approach to video-realistic speech animation. Pattern Recogn. 40, 2325\u20132340 (2007)","journal-title":"Pattern Recogn."},{"key":"35_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, X., Wang, L., Li, G., Seide, F., Soong, F.K.: A new language independent, photo-realistic talking head driven by voice only. In: Interspeech, pp. 2743\u20132747 (2013)","DOI":"10.21437\/Interspeech.2013-629"}],"container-title":["Lecture Notes in Computer Science","Latent Variable Analysis and Signal Separation"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-93764-9_35","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,3]],"date-time":"2023-09-03T00:03:34Z","timestamp":1693699414000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-93764-9_35"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783319937632","9783319937649"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-93764-9_35","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]}}}
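
The record above appears to be the raw JSON envelope returned by the Crossref REST API for the chapter's DOI (10.1007/978-3-319-93764-9_35). Below is a minimal sketch, not part of the record itself, of how such a work record could be fetched and its main fields read; it assumes the public https://api.crossref.org/works/{DOI} endpoint and the third-party Python `requests` package, and the field names it touches ("title", "author", "issued", "container-title", "page", "reference") are taken directly from the record above — everything else is illustrative.

# Sketch: fetch a Crossref work record and print a citation-style summary.
# Assumes the public Crossref REST API and the `requests` package.
import requests

DOI = "10.1007/978-3-319-93764-9_35"  # DOI taken from the record above

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
msg = resp.json()["message"]  # payload mirrors the "message" object in the record

title = msg["title"][0]                                    # chapter title
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in msg["author"])
year = msg["issued"]["date-parts"][0][0]                   # 2018
container = msg["container-title"][-1]                     # volume title
pages = msg.get("page", "")                                # "372-381"
cited_by = msg.get("is-referenced-by-count", 0)            # citations at deposit time

print(f"{authors}: {title}. In: {container}, pp. {pages} ({year}). doi:{DOI}")
print(f"References: {msg['references-count']}, cited by: {cited_by}")

# The "reference" array holds the chapter's own bibliography; entries carry a
# resolved DOI where Crossref matched one, otherwise only an "unstructured" string.
for ref in msg.get("reference", [])[:3]:
    print(ref.get("DOI") or ref.get("unstructured", "")[:80])

Whether to key on "issued" or "published-print" for the year, and whether to prefer the series title or the volume title from "container-title", is a formatting choice; the record carries both, as seen above.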