{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,21]],"date-time":"2026-07-21T14:56:46Z","timestamp":1784645806888,"version":"3.55.0"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319541839","type":"print"},{"value":"9783319541846","type":"electronic"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-54184-6_6","type":"book-chapter","created":{"date-parts":[[2017,3,9]],"date-time":"2017-03-09T15:44:25Z","timestamp":1489074265000},"page":"87-103","source":"Crossref","is-referenced-by-count":339,"title":["Lip Reading in the Wild"],"prefix":"10.1007","author":[{"given":"Joon Son","family":"Chung","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2017,3,10]]},"reference":[{"key":"6_CR1","doi-asserted-by":"crossref","unstructured":"Anina, I., Zhou, Z., Zhao, G., Pietik\u00e4inen, M.: OuluVS2: a multi-view audiovisual database for non-rigid mouth motion analysis. In: 2015 11th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition (FG), vol. 1, pp. 1\u20135. IEEE (2015)","DOI":"10.1109\/FG.2015.7163155"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Buehler, P., Everingham, M., Zisserman, A.: Learning sign language by watching TV (using weakly aligned subtitles). In: Proceedings of CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206523"},{"key":"6_CR3","doi-asserted-by":"crossref","unstructured":"Chatfield, K., Simonyan, K., Vedaldi, A., Zisserman, A.: Return of the devil in the details: delving deep into convolutional nets. In: Proceedings of BMVC (2014)","DOI":"10.5244\/C.28.6"},{"issue":"5","key":"6_CR4","doi-asserted-by":"crossref","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S., Shao, X.: An audio-visual corpus for speech perception and automatic speech recognition. J. Acoust. Soc. Am. 120(5), 2421\u20132424 (2006)","journal-title":"J. Acoust. Soc. Am."},{"key":"6_CR5","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2625\u20132634 (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Everingham, M., Sivic, J., Zisserman, A.: \u201cHello! My name is.. Buffy\u201d - automatic naming of characters in TV video. In: Proceedings of BMVC (2006)","DOI":"10.5244\/C.20.92"},{"issue":"1","key":"6_CR7","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1109\/TIFS.2007.916280","volume":"3","author":"Y Fu","year":"2008","unstructured":"Fu, Y., Yan, S., Huang, T.S.: Classification and feature extraction by simplexization. IEEE Trans. Inf. Forensics Secur. 3(1), 91\u2013100 (2008)","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"key":"6_CR8","doi-asserted-by":"crossref","first-page":"505","DOI":"10.1007\/978-3-662-13015-5_39","volume-title":"Speechreading by Humans and Machines","author":"AJ Goldschen","year":"1996","unstructured":"Goldschen, A.J., Garcia, O.N., Petajan, E.D.: Rationale for phoneme-viseme mapping and feature selection in visual speech recognition. In: Stork, D.G., Hennecke, M.E. (eds.) Speechreading by Humans and Machines, pp. 505\u2013515. Springer, Heidelberg (1996)"},{"issue":"4","key":"6_CR9","doi-asserted-by":"crossref","first-page":"1738","DOI":"10.1121\/1.399423","volume":"87","author":"H Hermansky","year":"1990","unstructured":"Hermansky, H.: Perceptual linear predictive (PLP) analysis of speech. J. Acoust. Soc. Am. 87(4), 1738\u20131752 (1990)","journal-title":"J. Acoust. Soc. Am."},{"key":"6_CR10","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 (2015)"},{"key":"6_CR11","unstructured":"Jaderberg, M., Simonyan, K., Vedaldi, A., Zisserman, A.: Synthetic data and artificial neural networks for natural scene text recognition. In: Workshop on Deep Learning, NIPS (2014)"},{"issue":"1","key":"6_CR12","doi-asserted-by":"crossref","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Xu, W., Yang, M., Yu, K.: 3D convolutional neural networks for human action recognition. IEEE PAMI 35(1), 221\u2013231 (2013)","journal-title":"IEEE PAMI"},{"key":"6_CR13","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 1725\u20131732 (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"Kazemi, V., Sullivan, J.: One millisecond face alignment with an ensemble of regression trees. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1867\u20131874 (2014)","DOI":"10.1109\/CVPR.2014.241"},{"key":"6_CR15","first-page":"1755","volume":"10","author":"DE King","year":"2009","unstructured":"King, D.E.: Dlib-ml: a machine learning toolkit. J. Acoust. Soc. Am. 10, 1755\u20131758 (2009)","journal-title":"J. Acoust. Soc. Am."},{"key":"6_CR16","doi-asserted-by":"crossref","unstructured":"Koller, O., Ney, H., Bowden, R.: Deep learning of mouth shapes for sign language. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 85\u201391 (2015)","DOI":"10.1109\/ICCVW.2015.69"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Kondrak, G.: A new algorithm for the alignment of phonetic sequences. In: Proceedings of the 1st North American chapter of the Association for Computational Linguistics conference, pp. 288\u2013295. Association for Computational Linguistics (2000)","DOI":"10.3115\/1073336.1073350"},{"key":"6_CR18","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: NIPS, pp. 1106\u20131114 (2012)"},{"key":"6_CR19","doi-asserted-by":"crossref","unstructured":"Lee, B., Hasegawa-Johnson, M., Goudeseune, C., Kamdar, S., Borys, S., Liu, M., Huang, T.S.: AVICAR: audio-visual speech corpus in a car environment. In: INTERSPEECH. Citeseer (2004)","DOI":"10.21437\/Interspeech.2004-424"},{"key":"6_CR20","doi-asserted-by":"crossref","first-page":"469","DOI":"10.1142\/S021946780100027X","volume":"1","author":"R Lienhart","year":"2001","unstructured":"Lienhart, R.: Reliable transition detection in videos: a survey and practitioner\u2019s guide. Images Graph. 1, 469\u2013486 (2001)","journal-title":"Images Graph."},{"key":"6_CR21","unstructured":"Lucey, P., Martin, T., Sridharan, S.: Confusability of phonemes grouped according to their viseme classes in noisy environments. In: Proceedings of Australian International Conference on Speech Science & Technical, pp. 265\u2013270 (2004)"},{"issue":"2","key":"6_CR22","doi-asserted-by":"crossref","first-page":"198","DOI":"10.1109\/34.982900","volume":"24","author":"I Matthews","year":"2002","unstructured":"Matthews, I., Cootes, T.F., Bangham, J.A., Cox, S., Harvey, R.: Extraction of visual features for lipreading. IEEE Trans. Pattern Anal. Mach. Intell. 24(2), 198\u2013213 (2002)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"6_CR23","doi-asserted-by":"crossref","first-page":"746","DOI":"10.1038\/264746a0","volume":"264","author":"H McGurk","year":"1976","unstructured":"McGurk, H., MacDonald, J.: Hearing lips and seeing voices. Nature 264, 746\u2013748 (1976)","journal-title":"Nature"},{"key":"6_CR24","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: Proceedings of the 28th International Conference on Machine Learning (ICML 2011), pp. 689\u2013696 (2011)"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Noda, K., Yamaguchi, Y., Nakadai, K., Okuno, H.G., Ogata, T.: Lipreading using convolutional neural network. In: INTERSPEECH, pp. 1149\u20131153 (2014)","DOI":"10.21437\/Interspeech.2014-293"},{"issue":"3","key":"6_CR26","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/TASL.2008.2011515","volume":"17","author":"G Papandreou","year":"2009","unstructured":"Papandreou, G., Katsamanis, A., Pitsikalis, V., Maragos, P.: Adaptive multimodal fusion by uncertainty compensation with application to audiovisual speech recognition. IEEE Trans. Audio Speech Lang. Process. 17(3), 423\u2013435 (2009)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Patterson, E.K., Gurbuz, S., Tufekci, Z., Gowdy, J.N.: CUAVE: a new audio-visual database for multimodal human-computer interface research. In: 2002 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), vol. 2, pp. II-2017. IEEE (2002)","DOI":"10.1109\/ICASSP.2002.5745028"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Pei, Y., Kim, T.K., Zha, H.: Unsupervised random forest manifold alignment for lipreading. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 129\u2013136 (2013)","DOI":"10.1109\/ICCV.2013.23"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Petridis, S., Pantic, M.: Deep complementary bottleneck features for visual speech recognition. ICASSP, pp. 2304\u20132308 (2016)","DOI":"10.1109\/ICASSP.2016.7472088"},{"key":"6_CR30","doi-asserted-by":"crossref","unstructured":"Rubin, S., Berthouzoz, F., Mysore, G.J., Li, W., Agrawala, M.: Content-based tools for editing audio stories. In: Proceedings of the 26th Annual ACM Symposium on User Interface Software and Technology, pp. 113\u2013122. ACM (2013)","DOI":"10.1145\/2501988.2501993"},{"key":"6_CR31","unstructured":"Simonyan, K., Vedaldi, A., Zisserman, A.: Deep inside convolutional networks: visualising image classification models and saliency maps. In: Workshop at International Conference on Learning Representations (2014)"},{"key":"6_CR32","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: International Conference on Learning Representations (2015)"},{"key":"6_CR33","doi-asserted-by":"crossref","unstructured":"Tamura, S., Ninomiya, H., Kitaoka, N., Osuga, S., Iribe, Y., Takeda, K., Hayamizu, S.: Audio-visual speech recognition using deep bottleneck features and high-performance lipreading. In: 2015 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA), pp. 575\u2013582. IEEE (2015)","DOI":"10.1109\/APSIPA.2015.7415335"},{"key":"6_CR34","unstructured":"Lucas, B.D., Kanade, T.: An iterative image registration technique with an application to stereo vision, Vancouver, BC, Canada (1981)"},{"key":"6_CR35","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"6_CR36","unstructured":"Ukai, N., Seko, T., Tamura, S., Hayamizu, S.: GIF-LR: GA-based informative feature for lipreading. In: Signal & Information Processing Association Annual Summit and Conference (APSIPA ASC), pp. 1\u20134. IEEE (2012)"},{"key":"6_CR37","doi-asserted-by":"crossref","unstructured":"Vedaldi, A., Lenc, K.: Matconvnet - convolutional neural networks for matlab. CoRR abs\/1412.4564 (2014)","DOI":"10.1145\/2733373.2807412"},{"key":"6_CR38","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"6_CR39","doi-asserted-by":"crossref","unstructured":"Wand, M., Koutn\u00edk, J., Schmidhuber, J.: Lipreading with long short-term memory. arXiv preprint arXiv:1601.08188 (2016)","DOI":"10.1109\/ICASSP.2016.7472852"},{"key":"6_CR40","unstructured":"Woodland, P.C., Leggetter, C., Odell, J., Valtchev, V., Young, S.J.: The 1994 HTK large vocabulary speech recognition system. In: 1995 International Conference on Acoustics, Speech, and Signal Processing, 1995. ICASSP-95, vol. 1, pp. 73\u201376. IEEE (1995)"},{"issue":"5","key":"6_CR41","first-page":"3878","volume":"123","author":"J Yuan","year":"2008","unstructured":"Yuan, J., Liberman, M.: Speaker identification on the scotus corpus. IEEE Trans. Audio Speech Lang. Process. 123(5), 3878 (2008)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"6_CR42","doi-asserted-by":"crossref","unstructured":"Yue-Hei Ng, J., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., Toderici, G.: Beyond short snippets: deep networks for video classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4694\u20134702 (2015)","DOI":"10.1109\/CVPR.2015.7299101"},{"issue":"7","key":"6_CR43","first-page":"1254","volume":"11","author":"G Zhao","year":"2009","unstructured":"Zhao, G., Barnard, M., Pietik\u00e4inen, M.: Lipreading with local spatiotemporal descriptors. IEEE Trans. Audio Speech Lang. Process. 11(7), 1254\u20131265 (2009)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"1","key":"6_CR44","first-page":"1","volume":"36","author":"Z Zhou","year":"2014","unstructured":"Zhou, Z., Hong, X., Zhao, G., Pietik\u00e4inen, M.: A compact representation of visual speech data using latent variables. IEEE Trans. Audio Speech Lang. Process. 36(1), 1\u20131 (2014)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"9","key":"6_CR45","first-page":"590","volume":"32","author":"Z Zhou","year":"2014","unstructured":"Zhou, Z., Zhao, G., Hong, X., Pietik\u00e4inen, M.: A review of recent advances in visual speech decoding. IEEE Trans. Audio Speech Lang. Process. 32(9), 590\u2013605 (2014)","journal-title":"IEEE Trans. Audio Speech Lang. Process."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2016"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-54184-6_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,16]],"date-time":"2025-06-16T14:26:22Z","timestamp":1750083982000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-54184-6_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319541839","9783319541846"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-54184-6_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017]]}}}