{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,21]],"date-time":"2025-10-21T15:30:13Z","timestamp":1761060613421,"version":"3.37.3"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2018,7,16]],"date-time":"2018-07-16T00:00:00Z","timestamp":1531699200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Multimodal User Interfaces"],"published-print":{"date-parts":[[2018,12]]},"DOI":"10.1007\/s12193-018-0266-2","type":"journal-article","created":{"date-parts":[[2018,7,16]],"date-time":"2018-07-16T07:11:41Z","timestamp":1531725101000},"page":"309-318","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Experimenting with lipreading for large vocabulary continuous speech recognition"],"prefix":"10.1007","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0693-6603","authenticated-orcid":false,"given":"Karel","family":"Pale\u010dek","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,7,16]]},"reference":[{"key":"266_CR1","unstructured":"Assael YM, Shillingford B, Whiteson S, de\u00a0Freitas N (2016) Lipnet: sentence-level lipreading. In: CoRR abs\/1611.01599"},{"key":"266_CR2","doi-asserted-by":"crossref","unstructured":"Cao X, Wei Y, Wen F, Sun J(2012) Face alignment by explicit shape regression. In: CVPR","DOI":"10.1007\/s11263-013-0667-3"},{"key":"266_CR3","doi-asserted-by":"crossref","unstructured":"Chung JS, Senior AW, Vinyals O, Zisserman A (2016) Lip reading sentences in the wild. In: CoRR","DOI":"10.1109\/CVPR.2017.367"},{"key":"266_CR4","unstructured":"C\u00edsa\u0159 P (2006) Application of lipreading methods for speech recognition. Ph.D. thesis"},{"issue":"5","key":"266_CR5","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke M, Barker J, Cunningham S, Shao X (2006) An audio-visual corpus for speech perception and automatic speech recognition. J Acoust Soc Am 120(5):2421\u20132424","journal-title":"J Acoust Soc Am"},{"issue":"4","key":"266_CR6","doi-asserted-by":"publisher","first-page":"1145","DOI":"10.1109\/TASL.2011.2172427","volume":"20","author":"V Estellers","year":"2012","unstructured":"Estellers V, Gurban M, Thiran J (2012) On dynamic stream weighting for audio-visual speech recognition. IEEE Trans Audio Speech Lang Process 20(4):1145\u20131157","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"266_CR7","unstructured":"Galatas G, Potamianos G, Makedon F (2012) Audio-visual speech recognition incorporating facial depth information captured by the kinect. In: Proceedings of the 20th European signal processing conference (EUSIPCO), pp 2714\u20132717"},{"key":"266_CR8","doi-asserted-by":"crossref","unstructured":"Glotin H, Vergyr D, Neti C, Potamianos G, Luettin J (2001) Weighting schemes for audio-visual fusion in speech recognition. In: 2001 IEEE international conference on acoustics, speech, and signal processing (ICASSP \u201901), vol 1, pp 173\u2013176","DOI":"10.1109\/ICASSP.2001.940795"},{"issue":"5","key":"266_CR9","doi-asserted-by":"publisher","first-page":"603","DOI":"10.1109\/TMM.2015.2407694","volume":"17","author":"N Harte","year":"2015","unstructured":"Harte N, Gillen E (2015) Tcd-timit: an audio-visual corpus of continuous speech. IEEE Trans Multimed 17(5):603\u2013615","journal-title":"IEEE Trans Multimed"},{"key":"266_CR10","unstructured":"Lan Y, Theobald B, Harvey R, Bowden R (2010) Improving visual features for lip-reading. In: Proceedings of the international conference on auditory-visual speech processing, 2010, pp 142\u2013147"},{"key":"266_CR11","doi-asserted-by":"crossref","unstructured":"Lee B, Hasegawa-Johnson M, Goudeseune C, Kamdar S, Borys S, Liu M, Huang TS (2004) AVICAR: audio-visual speech corpus in a car environment. In: INTERSPEECH, pp 2489\u20132492","DOI":"10.21437\/Interspeech.2004-424"},{"issue":"3","key":"266_CR12","doi-asserted-by":"publisher","first-page":"495","DOI":"10.1109\/TMM.2005.846777","volume":"7","author":"S Lucey","year":"2005","unstructured":"Lucey S, Chen T, Sridharan S, Chandran V (2005) Integration strategies for audio-visual speech processing: applied to text-dependent speaker recognition. IEEE Trans Multimed 7(3):495\u2013506","journal-title":"IEEE Trans Multimed"},{"key":"266_CR13","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1038\/264746a0","volume":"264","author":"H McGurk","year":"1976","unstructured":"McGurk H, MacDonald J (1976) Hearing lips and seeing voices. Nature 264:746\u2013748","journal-title":"Nature"},{"key":"266_CR14","unstructured":"Ngiam J, Khosla A, Kim M, Nam J, Lee H, Ng AY (2011)Multimodal deep learning. In: Proceedings of the 28th international conference on machine learning, ICML 2011, Bellevue, Washington, USA, June 28-July 2, 2011, pp 689\u2013696"},{"key":"266_CR15","doi-asserted-by":"crossref","unstructured":"Noda K, Yamaguchi Y, Nakadai K, Okuno, H, Ogata, T (2014) Lipreading using convolutional neural network. In: International speech and communication association, pp 1149\u20131153","DOI":"10.21437\/Interspeech.2014-293"},{"key":"266_CR16","unstructured":"Nouza, J, Psutka J, Uhl\u00ed\u0159 (1997) Phonetic alphabet for speech recognition of czech. Radioengineering 6(4):16\u201320"},{"key":"266_CR17","doi-asserted-by":"crossref","unstructured":"Ong E, Bowden, R (2011) Learning sequential patterns for lipreading. In: Proceedings of the British machine vision conference, BMVC 2011, Dundee, UK, August 29-September 2, 2011, pp 1\u201310","DOI":"10.5244\/C.25.55"},{"key":"266_CR18","doi-asserted-by":"crossref","unstructured":"Palecek K (2016) Lipreading using spatiotemporal histogram of oriented gradients. In: EUSIPCO 2016, Budapest, Hungary, 2016, pp 1882\u20131885","DOI":"10.1109\/EUSIPCO.2016.7760575"},{"key":"266_CR19","first-page":"438","volume-title":"Spatiotemporal convolutional features for lipreading","author":"K Pale\u010dek","year":"2017","unstructured":"Pale\u010dek K (2017) Spatiotemporal convolutional features for lipreading. Springer, Cham, pp 438\u2013446"},{"key":"266_CR20","doi-asserted-by":"publisher","first-page":"767","DOI":"10.1007\/978-3-319-66429-3_77","volume-title":"Speech and computer","author":"K Pale\u010dek","year":"2017","unstructured":"Pale\u010dek K (2017) Utilizing lipreading in large vocabulary continuous speech recognition. In: Karpov A, Potapova R, Mporas I (eds) Speech and computer. Springer, Cham, pp 767\u2013776"},{"key":"266_CR21","doi-asserted-by":"crossref","unstructured":"Pei Y, Kim T, Zha H (2013) Unsupervised random forest manifold alignment for lipreading. In: IEEE international conference on computer vision, ICCV 2013, Sydney, Australia, December 1\u20138, 2013, pp 129\u2013136","DOI":"10.1109\/ICCV.2013.23"},{"key":"266_CR22","doi-asserted-by":"crossref","unstructured":"Petridis S, Li Z, Pantic M (2017) End-to-end visual speech recognition with LSTMS. In: 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) pp 2592\u20132596","DOI":"10.1109\/ICASSP.2017.7952625"},{"key":"266_CR23","doi-asserted-by":"crossref","unstructured":"Potamianos G, Neti C, Gravier G, Garg A, Senior AW (2003) Recent advances in the automatic recognition of audio-visual speech. In: Proceedings of the IEEE, pp 1306\u20131326","DOI":"10.1109\/JPROC.2003.817150"},{"key":"266_CR24","unstructured":"Ramage MD (2013) Disproving visemes as the basic visual unit of speech. Ph.D. thesis"},{"key":"266_CR25","doi-asserted-by":"crossref","unstructured":"Saenko K, Livescu K, Siracusa M, Wilson K, Glass J, Darrell T (2005) Visual speech recognition with loosely synchronized feature streams. In: Proceedings of the tenth IEEE international conference on computer vision, ICCV \u201905, vol 2. IEEE Computer Society, Washington, DC, USA, pp 1424\u20131431","DOI":"10.1109\/ICCV.2005.251"},{"key":"266_CR26","doi-asserted-by":"crossref","unstructured":"Stolcke A (2002) SRILM: an extensible language modeling toolkit. In: Proceedings of ICSLP, vol 2. Denver, USA, pp 901\u2013904","DOI":"10.21437\/ICSLP.2002-303"},{"key":"266_CR27","first-page":"377","volume-title":"Visual speech feature representations: recent advances","author":"C Sui","year":"2016","unstructured":"Sui C, Bennamoun M, Togneri R (2016) Visual speech feature representations: recent advances. Springer, Cham, pp 377\u2013396"},{"key":"266_CR28","volume-title":"Hearing by eye: the psychology of lip-reading","author":"Q Summerfield","year":"1987","unstructured":"Summerfield Q (1987) Some preliminaries to a comprehensive account of audio-visual speech perception. In: Dodd B (ed) Hearing by eye: the psychology of lip-reading. Lawrence Erlbaum Associates, Hillsdale"},{"key":"266_CR29","doi-asserted-by":"crossref","unstructured":"Wand M, Koutn\u00edk J Schmidhuber J (2016) Lipreading with long short-term memory. In: CoRR","DOI":"10.1109\/ICASSP.2016.7472852"},{"issue":"7","key":"266_CR30","doi-asserted-by":"publisher","first-page":"1254","DOI":"10.1109\/TMM.2009.2030637","volume":"11","author":"G Zhao","year":"2009","unstructured":"Zhao G, Barnard M, Pietik\u00e4inen M (2009) Lipreading with local spatiotemporal descriptors. IEEE Trans Multimed 11(7):1254\u20131265","journal-title":"IEEE Trans Multimed"},{"issue":"9","key":"266_CR31","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1016\/j.imavis.2014.06.004","volume":"32","author":"Z Zhou","year":"2014","unstructured":"Zhou Z, Zhao G, Hong X, Pietikinen M (2014) A review of recent advances in visual speech decoding. Image Vis Comput 32(9):590\u2013605","journal-title":"Image Vis Comput"},{"key":"266_CR32","doi-asserted-by":"crossref","unstructured":"Zhou Z, Zhao G, Pietikainen M (2011)Towards a practical lipreading system. In: Proceedings of the 2011 IEEE conference on computer vision and pattern recognition, CVPR \u201911. IEEE Computer Society, Washington, DC, USA, pp 137\u2013144","DOI":"10.1109\/CVPR.2011.5995345"}],"container-title":["Journal on Multimodal User Interfaces"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s12193-018-0266-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s12193-018-0266-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s12193-018-0266-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,27]],"date-time":"2022-08-27T16:59:16Z","timestamp":1661619556000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s12193-018-0266-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,7,16]]},"references-count":32,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2018,12]]}},"alternative-id":["266"],"URL":"https:\/\/doi.org\/10.1007\/s12193-018-0266-2","relation":{},"ISSN":["1783-7677","1783-8738"],"issn-type":[{"type":"print","value":"1783-7677"},{"type":"electronic","value":"1783-8738"}],"subject":[],"published":{"date-parts":[[2018,7,16]]},"assertion":[{"value":"31 October 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 June 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 July 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}