{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T10:22:37Z","timestamp":1760955757367,"version":"3.41.0"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Singapore","isbn-type":[{"type":"print","value":"9789811072987"},{"type":"electronic","value":"9789811072994"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-981-10-7299-4_54","type":"book-chapter","created":{"date-parts":[[2017,11,29]],"date-time":"2017-11-29T16:27:13Z","timestamp":1511972833000},"page":"650-661","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Deep Temporal Architecture for Audiovisual Speech Recognition"],"prefix":"10.1007","author":[{"given":"Chunlin","family":"Tian","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuan","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoqiang","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,11,30]]},"reference":[{"key":"54_CR1","doi-asserted-by":"crossref","unstructured":"Amer, M.R., Siddiquie, B., Khan, S., Divakaran, A., Sawhney, H.: Multimodal fusion using dynamic hybrid models, pp. 556\u2013563 (2014)","DOI":"10.1109\/WACV.2014.6836053"},{"key":"54_CR2","unstructured":"Amodei, D., Anubhai, R., Battenberg, E., Case, C.J., Casper, J., Catanzaro, B., Chen, J., Chrzanowski, M., Coates, A., Diamos, G., et al.: Deep speech 2: end-to-end speech recognition in English and mandarin, pp. 173\u2013182 (2015)"},{"issue":"6","key":"54_CR3","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/s00530-010-0182-0","volume":"16","author":"PK Atrey","year":"2010","unstructured":"Atrey, P.K., Hossain, M.A., El Saddik, A., Kankanhalli, M.S.: Multimodal fusion for multimedia analysis: a survey. Multimed. Syst. 16(6), 345\u2013379 (2010)","journal-title":"Multimed. Syst."},{"key":"54_CR4","volume-title":"Deep Learning","author":"Y Bengio","year":"2015","unstructured":"Bengio, Y., Goodfellow, I.J., Courville, A.: Deep Learning. MIT Press, Cambridge (2015). http:\/\/www.iro.umontreal.ca\/bengioy\/dlbook"},{"key":"54_CR5","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Lip reading sentences in the wild (2016)","DOI":"10.1109\/CVPR.2017.367"},{"key":"54_CR6","doi-asserted-by":"crossref","unstructured":"Galatas, G., Potamianos, G., Makedon, F.: Audio-visual speech recognition incorporating facial depth information captured by the kinect. In: 2012 Proceedings of the 20th European Signal Processing Conference (EUSIPCO), pp. 2714\u20132717. IEEE (2012)","DOI":"10.1145\/2413097.2413100"},{"issue":"8","key":"54_CR7","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"54_CR8","doi-asserted-by":"crossref","unstructured":"Hu, D., Li, X., et al.: Temporal multimodal learning in audiovisual speech recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3574\u20133582 (2016)","DOI":"10.1109\/CVPR.2016.389"},{"key":"54_CR9","doi-asserted-by":"crossref","unstructured":"Kruthiventi, S.S., Ayush, K., Babu, R.V.: DeepFix: a fully convolutional neural network for predicting human eye fixations. IEEE Trans. Image Process. (2017)","DOI":"10.1109\/TIP.2017.2710620"},{"key":"54_CR10","doi-asserted-by":"crossref","unstructured":"Lu, X., Zheng, X., Yuan, Y.: Remote sensing scene classification by unsupervised representation learning. IEEE Trans. Geosci. Remote Sens. (2017)","DOI":"10.1109\/TGRS.2017.2702596"},{"key":"54_CR11","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-76316-3","volume-title":"Multimodal Processing and Interaction: Audio, Video, Text","author":"P Maragos","year":"2008","unstructured":"Maragos, P., Potamianos, A., Gros, P.: Multimodal Processing and Interaction: Audio, Video, Text, vol. 33. Springer Science & Business Media, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-0-387-76316-3"},{"issue":"2","key":"54_CR12","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1109\/34.982900","volume":"24","author":"I Matthews","year":"2002","unstructured":"Matthews, I., Cootes, T.F., Bangham, J.A., Cox, S., Harvey, R.: Extraction of visual features for lipreading. IEEE Trans. Pattern Anal. Mach. Intell. 24(2), 198\u2013213 (2002)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"54_CR13","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1038\/264746a0","volume":"264","author":"H McGurk","year":"1976","unstructured":"McGurk, H., MacDonald, J.: Hearing lips and seeing voices. Nature 264, 746\u2013748 (1976)","journal-title":"Nature"},{"key":"54_CR14","doi-asserted-by":"crossref","unstructured":"Mroueh, Y., Marcheret, E., Goel, V.: Deep multimodal learning for audio-visual speech recognition, pp. 2130\u20132134 (2015)","DOI":"10.1109\/ICASSP.2015.7178347"},{"issue":"1","key":"54_CR15","doi-asserted-by":"publisher","first-page":"781","DOI":"10.1016\/j.neuroimage.2011.07.024","volume":"59","author":"AR Nath","year":"2012","unstructured":"Nath, A.R., Beauchamp, M.S.: A neural basis for interindividual differences in the McGurk effect, a multisensory speech illusion. NeuroImage 59(1), 781\u2013787 (2012)","journal-title":"NeuroImage"},{"issue":"11","key":"54_CR16","first-page":"1","volume":"2002","author":"AV Nefian","year":"2002","unstructured":"Nefian, A.V., Liang, L., Pi, X., Liu, X., Murphy, K.: Dynamic Bayesian networks for audio-visual speech recognition. EURASIP J. Adv. Sig. Process. 2002(11), 1\u201315 (2002)","journal-title":"EURASIP J. Adv. Sig. Process."},{"key":"54_CR17","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: Proceedings of the 28th International Conference on Machine Learning, ICML 2011, pp. 689\u2013696 (2011)"},{"issue":"4","key":"54_CR18","doi-asserted-by":"publisher","first-page":"722","DOI":"10.1007\/s10489-014-0629-7","volume":"42","author":"K Noda","year":"2015","unstructured":"Noda, K., Yamaguchi, Y., Nakadai, K., Okuno, H.G., Ogata, T.: Audio-visual speech recognition using deep learning. Appl. Intell. 42(4), 722\u2013737 (2015)","journal-title":"Appl. Intell."},{"key":"54_CR19","unstructured":"Pascanu, R., Mikolov, T., Bengio, Y.: On the difficulty of training recurrent neural networks. In: ICML (3), vol. 28, pp. 1310\u20131318 (2013)"},{"key":"54_CR20","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1016\/j.inffus.2017.02.003","volume":"37","author":"S Poria","year":"2017","unstructured":"Poria, S., Cambria, E., Bajpai, R., Hussain, A.: A review of affective computing: from unimodal analysis to multimodal fusion. Inf. Fusion 37, 98\u2013125 (2017)","journal-title":"Inf. Fusion"},{"key":"54_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1007\/978-3-642-35289-8_5","volume-title":"Neural Networks: Tricks of the Trade","author":"L Prechelt","year":"2012","unstructured":"Prechelt, L.: Early stopping\u2014but when? In: Montavon, G., Orr, G.B., M\u00fcller, K.-R. (eds.) Neural Networks: Tricks of the Trade. LNCS, vol. 7700, 2nd edn, pp. 53\u201367. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-35289-8_5","edition":"2"},{"issue":"2","key":"54_CR22","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1109\/5.18626","volume":"77","author":"LR Rabiner","year":"1989","unstructured":"Rabiner, L.R.: A tutorial on hidden Markov models and selected applications in speech recognition. Proc. IEEE 77(2), 257\u2013286 (1989)","journal-title":"Proc. IEEE"},{"key":"54_CR23","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"issue":"1","key":"54_CR24","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G.E., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15(1), 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"54_CR25","unstructured":"Srivastava, N., Salakhutdinov, R.: Learning representations for multimodal data with deep belief nets. In: International Conference on Machine Learning Workshop (2012)"},{"key":"54_CR26","unstructured":"Srivastava, N., Salakhutdinov, R.R.: Multimodal learning with deep Boltzmann machines. In: Advances in Neural Information Processing Systems, pp. 2222\u20132230 (2012)"},{"key":"54_CR27","unstructured":"Sutskever, I., Vinyals, O., Le, Q.: Sequence to sequence learning with neural networks, pp. 3104\u20133112 (2014)"},{"key":"54_CR28","doi-asserted-by":"crossref","unstructured":"Tamura, S., Ninomiya, H., Kitaoka, N., Osuga, S., Iribe, Y., Takeda, K., Hayamizu, S.: Audio-visual speech recognition using deep bottleneck features and high-performance lipreading. In: 2015 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA), pp. 575\u2013582. IEEE (2015)","DOI":"10.1109\/APSIPA.2015.7415335"},{"key":"54_CR29","doi-asserted-by":"crossref","unstructured":"Viola, P., Jones, M.: Rapid object detection using a boosted cascade of simple features. In: Proceedings of the 2001 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, CVPR 2001, vol. 1, pp. I\u2013511. IEEE (2001)","DOI":"10.1109\/CVPR.2001.990517"},{"key":"54_CR30","doi-asserted-by":"crossref","unstructured":"Xia, G.S., Hu, J., Hu, F., Shi, B., Bai, X., Zhong, Y., Zhang, L., Lu, X.: AID: a benchmark data set for performance evaluation of aerial scene classification. IEEE Trans. Geosci. Remote Sens. (2017)","DOI":"10.1109\/TGRS.2017.2685945"},{"issue":"7","key":"54_CR31","doi-asserted-by":"publisher","first-page":"1254","DOI":"10.1109\/TMM.2009.2030637","volume":"11","author":"G Zhao","year":"2009","unstructured":"Zhao, G., Barnard, M., Pietikainen, M.: Lipreading with local spatiotemporal descriptors. IEEE Trans. Multimed. 11(7), 1254\u20131265 (2009)","journal-title":"IEEE Trans. Multimed."},{"key":"54_CR32","doi-asserted-by":"crossref","unstructured":"Zheng, X., Yuan, Y., Lu, X.: Dimensionality reduction by spatial-spectral preservation in selected bands. IEEE Trans. Geosci. Remote Sens. (2017)","DOI":"10.1109\/TGRS.2017.2703598"}],"container-title":["Communications in Computer and Information Science","Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-10-7299-4_54","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,27]],"date-time":"2025-06-27T20:18:21Z","timestamp":1751055501000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-10-7299-4_54"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9789811072987","9789811072994"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-10-7299-4_54","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2017]]},"assertion":[{"value":"30 November 2017","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CCCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF Chinese Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tianjin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2017","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2017","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 October 2017","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cccv2017","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/ccf-cccv.org\/2017\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}