{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:16:26Z","timestamp":1777655786352,"version":"3.51.4"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012243","type":"print"},{"value":"9783030012250","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01225-0_32","type":"book-chapter","created":{"date-parts":[[2018,10,8]],"date-time":"2018-10-08T08:39:54Z","timestamp":1538987994000},"page":"536-552","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":23,"title":["Zero-Shot Keyword Spotting for Visual Speech Recognition In-the-wild"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9227-3588","authenticated-orcid":false,"given":"Themos","family":"Stafylakis","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1803-5338","authenticated-orcid":false,"given":"Georgios","family":"Tzimiropoulos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"32_CR1","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Lip reading sentences in the wild. In: Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.367"},{"key":"32_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1007\/978-3-319-54184-6_6","volume-title":"Computer Vision \u2013 ACCV 2016","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Lip reading in the wild. In: Lai, S.-H., Lepetit, V., Nishino, K., Sato, Y. (eds.) ACCV 2016. LNCS, vol. 10112, pp. 87\u2013103. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54184-6_6"},{"key":"32_CR3","doi-asserted-by":"crossref","unstructured":"Anina, I., Zhou, Z., Zhao, G., Pietik\u00e4inen, M.: OuluVS2: a multi-view audiovisual database for non-rigid mouth motion analysis. In: 2015 11th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition (FG), vol. 1, pp. 1\u20135. IEEE (2015)","DOI":"10.1109\/FG.2015.7163155"},{"issue":"8","key":"32_CR4","doi-asserted-by":"publisher","first-page":"1351","DOI":"10.1109\/JSTSP.2017.2759726","volume":"11","author":"K Audhkhasi","year":"2017","unstructured":"Audhkhasi, K., Rosenberg, A., Sethy, A., Ramabhadran, B., Kingsbury, B.: End-to-end ASR-free keyword search from speech. IEEE J. Selected Top. Signal Process. 11(8), 1351\u20131359 (2017)","journal-title":"IEEE J. Selected Top. Signal Process."},{"key":"32_CR5","doi-asserted-by":"crossref","unstructured":"Audhkhasi, K., Ramabhadran, B., Saon, G., Picheny, M., Nahamoo, D.: Direct acoustics-to-word models for english conversational speech recognition. In: Interspeech (2017)","DOI":"10.21437\/Interspeech.2017-546"},{"key":"32_CR6","doi-asserted-by":"crossref","unstructured":"Soltau, H., Liao, H., Sak, H.: Neural speech recognizer: acoustic-to-word LSTM model for large vocabulary speech recognition. In: Interspeech (2017)","DOI":"10.21437\/Interspeech.2017-1566"},{"key":"32_CR7","unstructured":"Socher, R., Ganjoo, M., Manning, C.D., Ng, A.: Zero-shot learning through cross-modal transfer. In: Advances in Neural Information Processing Systems (NIPS) (2013)"},{"key":"32_CR8","unstructured":"Chung, J.S., Zisserman, A.: Lipreading Sentences in the wild (link to LRS2). http:\/\/www.robots.ox.ac.uk\/~vgg\/data\/lip_reading_sentences\/"},{"key":"32_CR9","unstructured":"Assael, Y.M., Shillingford, B., Whiteson, S., de Freitas, N.: Lipnet: Sentence-level lipreading. arXiv preprint arXiv:1611.01599 (2016)"},{"issue":"5","key":"32_CR10","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S., Shao, X.: An audio-visual corpus for speech perception and automatic speech recognition. J. Acoust. Soc. America 120(5), 2421\u20132424 (2006)","journal-title":"J. Acoust. Soc. America"},{"key":"32_CR11","unstructured":"Graves, A., Jaitly, N.: Towards end-to-end speech recognition with recurrent neural networks. In: International Conference on Machine Learning, pp. 1764\u20131772 (2014)"},{"key":"32_CR12","doi-asserted-by":"crossref","unstructured":"Zweig, G., Yu, C., Droppo, J., Stolcke, A.: Advances in all-neural speech recognition. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4805\u20134809. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7953069"},{"key":"32_CR13","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Zisserman, A.: Lip reading in profile. In: British Machine Vision Conference (BMVC) (2017)","DOI":"10.1007\/978-3-319-54184-6_6"},{"key":"32_CR14","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q., Vinyals, O.: Listen, attend and spell: A neural network for large vocabulary conversational speech recognition. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4960\u20134964 (2016)","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"32_CR15","doi-asserted-by":"crossref","unstructured":"Bear, H.L., Harvey, R.: Decoding visemes: improving machine lip-reading. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2009\u20132013. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472029"},{"key":"32_CR16","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems (NIPS), pp. 5998\u20136008 (2017)"},{"key":"32_CR17","doi-asserted-by":"crossref","unstructured":"Koumparoulis, A., Potamianos, G., Mroueh, Y., Rennie, S.J.: Exploring ROI size in deep learning based lipreading. In: AVSP (2017)","DOI":"10.21437\/AVSP.2017-13"},{"key":"32_CR18","doi-asserted-by":"crossref","unstructured":"Petridis, S., Stafylakis, T., Ma, P., Cai, F., Tzimiropoulos, G., Pantic, M.: End-to-end audiovisual speech recognition. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2018)","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"32_CR19","doi-asserted-by":"crossref","unstructured":"Wand, M., Schmidhuber, J.: Improving speaker-independent lipreading with domain-adversarial training. In: Interspeech (2017)","DOI":"10.21437\/Interspeech.2017-421"},{"key":"32_CR20","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: Deep lip reading: a comparison of models and an online application. arXiv preprint arXiv:1806.06053 (2018)","DOI":"10.21437\/Interspeech.2018-1943"},{"key":"32_CR21","doi-asserted-by":"crossref","unstructured":"Xu, K., Li, D., Cassimatis, N., Wang, X.: LCANet: end-to-end lipreading with cascaded attention-CTC. In: 13th IEEE International Conference on Automatic Face & Gesture Recognition (FG), pp. 548\u2013555. IEEE (2018)","DOI":"10.1109\/FG.2018.00088"},{"key":"32_CR22","doi-asserted-by":"crossref","unstructured":"Sterpu, G., Saam, C., Harte, N.: Can DNNs learn to lipread full sentences? arXiv preprint arXiv:1805.11685 (2018)","DOI":"10.1109\/ICIP.2018.8451388"},{"issue":"7","key":"32_CR23","doi-asserted-by":"publisher","first-page":"1290","DOI":"10.1109\/TASLP.2018.2815268","volume":"26","author":"Fei Tao","year":"2018","unstructured":"Tao, F., Busso, C.: Gating neural network for large vocabulary audiovisual speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. (TASLP) 26(7), 1286\u20131298 (2018)","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"32_CR24","doi-asserted-by":"crossref","unstructured":"Mroueh, Y., Marcheret, E., Goel, V.: Deep multimodal learning for audio-visual speech recognition. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2130\u20132134. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178347"},{"key":"32_CR25","doi-asserted-by":"crossref","unstructured":"Bengio, S., Heigold, G.: Word embeddings for speech recognition. In: Interspeech (2014)","DOI":"10.21437\/Interspeech.2014-273"},{"key":"32_CR26","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.: Glove: Global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"32_CR27","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Advances in Neural Information Processing Systems (NIPS), pp. 3111\u20133119 (2013)"},{"key":"32_CR28","doi-asserted-by":"crossref","unstructured":"Palaz, D., Synnaeve, G., Collobert, R.: Jointly learning to locate and classify words using convolutional networks. In: Interspeech, pp. 2741\u20132745 (2016)","DOI":"10.21437\/Interspeech.2016-968"},{"key":"32_CR29","doi-asserted-by":"crossref","unstructured":"Sun, M., et al.: Compressed time delay neural network for small-footprint keyword spotting. In: Interspeech, pp. 3607\u20133611 (2017)","DOI":"10.21437\/Interspeech.2017-480"},{"key":"32_CR30","doi-asserted-by":"crossref","unstructured":"Sun, M., Nagaraja, V., Hoffmeister, B., Vitaladevuni, S.: Model shrinking for embedded keyword spotting. In: IEEE 14th International Conference on Machine Learning and Applications (ICMLA), pp. 369\u2013374. IEEE (2015)","DOI":"10.1109\/ICMLA.2015.121"},{"key":"32_CR31","doi-asserted-by":"crossref","unstructured":"Chen, G., Parada, C., Heigold, G.: Small-footprint keyword spotting using deep neural networks. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4087\u20134091 (2014)","DOI":"10.1109\/ICASSP.2014.6854370"},{"key":"32_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1007\/978-3-540-74695-9_23","volume-title":"Artificial Neural Networks \u2013 ICANN 2007","author":"S Fern\u00e1ndez","year":"2007","unstructured":"Fern\u00e1ndez, S., Graves, A., Schmidhuber, J.: An application of recurrent neural networks to discriminative keyword spotting. In: de S\u00e1, J.M., Alexandre, L.A., Duch, W., Mandic, D. (eds.) ICANN 2007. LNCS, vol. 4669, pp. 220\u2013229. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-74695-9_23"},{"key":"32_CR33","doi-asserted-by":"crossref","unstructured":"Jha, A., Namboodiri, V.P., Jawahar, C.: Word spotting in silent lip videos. In: 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 150\u2013159. IEEE (2018)","DOI":"10.1109\/WACV.2018.00023"},{"key":"32_CR34","doi-asserted-by":"crossref","unstructured":"Stafylakis, T., Tzimiropoulos, G.: Combining Residual Networks with LSTMs for Lipreading. In: Interspeech (2017)","DOI":"10.21437\/Interspeech.2017-85"},{"key":"32_CR35","doi-asserted-by":"crossref","unstructured":"Stafylakis, T., Tzimiropoulos, G.: Deep word embeddings for visual speech recognition. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2018)","DOI":"10.1109\/ICASSP.2018.8461347"},{"key":"32_CR36","unstructured":"Chung, J.S., Zisserman, A.: Lipreading in the wild (link to LRW), http:\/\/www.robots.ox.ac.uk\/~vgg\/data\/lip_reading\/"},{"key":"32_CR37","doi-asserted-by":"crossref","unstructured":"Xian, Y., Lampert, C.H., Schiele, B., Akata, Z.: Zero-shot learning-a comprehensive evaluation of the good, the bad and the ugly. arXiv preprint arXiv:1707.00600 (2017)","DOI":"10.1109\/CVPR.2017.328"},{"key":"32_CR38","unstructured":"Frome, A., et al.: Devise: A deep visual-semantic embedding model. In: Advances in Neural Information Processing Systems (NIPS), pp. 2121\u20132129 (2013)"},{"issue":"7","key":"32_CR39","doi-asserted-by":"publisher","first-page":"1425","DOI":"10.1109\/TPAMI.2015.2487986","volume":"38","author":"Z Akata","year":"2016","unstructured":"Akata, Z., Perronnin, F., Harchaoui, Z., Schmid, C.: Label-embedding for image classification. IEEE Trans. Pattern Anal. Mach. Intell. 38(7), 1425\u20131438 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"32_CR40","doi-asserted-by":"crossref","unstructured":"Mahasseni, B., Todorovic, S.: Regularizing long short term memory with 3D human-skeleton sequences for action recognition. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2016","DOI":"10.1109\/CVPR.2016.333"},{"key":"32_CR41","doi-asserted-by":"crossref","unstructured":"Yao, K., Zweig, G.: Sequence-to-sequence neural net models for grapheme-to-phoneme conversion. arXiv preprint arXiv:1506.00196 (2015)","DOI":"10.21437\/Interspeech.2015-134"},{"key":"32_CR42","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Advances in Neural Information Processing Systems, pp. 3104\u20133112 (2014)"},{"key":"32_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"630","DOI":"10.1007\/978-3-319-46493-0_38","volume-title":"Computer Vision \u2013 ECCV 2016","author":"K He","year":"2016","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Identity mappings in deep residual networks. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 630\u2013645. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_38"},{"issue":"8","key":"32_CR44","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"32_CR45","unstructured":"Gal, Y., Ghahramani, Z.: A theoretically grounded application of dropout in recurrent neural networks. In: Advances in Neural Information Processing Systems (NIPS), pp. 1019\u20131027 (2016)"},{"key":"32_CR46","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2014)"},{"key":"32_CR47","doi-asserted-by":"crossref","unstructured":"Watanabe, S., et al.: ESPnet: end-to-end speech processing toolkit. arXiv preprint arXiv:1804.00015 (2018)","DOI":"10.21437\/Interspeech.2018-1456"},{"issue":"8","key":"32_CR48","doi-asserted-by":"publisher","first-page":"1240","DOI":"10.1109\/JSTSP.2017.2763455","volume":"11","author":"S Watanabe","year":"2017","unstructured":"Watanabe, S., Hori, T., Kim, S., Hershey, J.R., Hayashi, T.: Hybrid CTC\/attention architecture for end-to-end speech recognition. IEEE J. Selected Top. Signal Process. 11(8), 1240\u20131253 (2017)","journal-title":"IEEE J. Selected Top. Signal Process."},{"key":"32_CR49","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an ASR corpus based on public domain audio books. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"32_CR50","doi-asserted-by":"crossref","unstructured":"Miller, D.R., et al.: Rapid and accurate spoken term detection. In: Eighth Annual Conference of the International Speech Communication Association (2007)","DOI":"10.21437\/Interspeech.2007-174"},{"key":"32_CR51","doi-asserted-by":"crossref","unstructured":"Zhuang, Y., Chang, X., Qian, Y., Yu, K.: Unrestricted vocabulary keyword spotting using LSTM-CTC. In: Interspeech, pp. 938\u2013942 (2016)","DOI":"10.21437\/Interspeech.2016-753"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01225-0_32","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,8]],"date-time":"2022-10-08T00:13:34Z","timestamp":1665188014000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01225-0_32"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012243","9783030012250"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01225-0_32","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}