{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T22:40:03Z","timestamp":1749595203016,"version":"3.41.0"},"reference-count":31,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"10","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2016]]},"DOI":"10.1587\/transinf.2016slp0019","type":"journal-article","created":{"date-parts":[[2016,9,30]],"date-time":"2016-09-30T22:23:29Z","timestamp":1475274209000},"page":"2444-2451","source":"Crossref","is-referenced-by-count":4,"title":["Investigation of DNN-Based Audio-Visual Speech Recognition"],"prefix":"10.1587","volume":"E99.D","author":[{"given":"Satoshi","family":"TAMURA","sequence":"first","affiliation":[{"name":"Gifu University"}]},{"given":"Hiroshi","family":"NINOMIYA","sequence":"additional","affiliation":[{"name":"Nagoya University"}]},{"given":"Norihide","family":"KITAOKA","sequence":"additional","affiliation":[{"name":"Tokushima University"}]},{"given":"Shin","family":"OSUGA","sequence":"additional","affiliation":[{"name":"Aisin Seiki Co., Ltd."}]},{"given":"Yurie","family":"IRIBE","sequence":"additional","affiliation":[{"name":"Aichi Prefectural University"}]},{"given":"Kazuya","family":"TAKEDA","sequence":"additional","affiliation":[{"name":"Nagoya University"}]},{"given":"Satoru","family":"HAYAMIZU","sequence":"additional","affiliation":[{"name":"Gifu University"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] B.D. Van Veen and K.M. Buckley, \u201cBeamforming: A versatile approach to spatial filtering,\u201d IEEE Acoust. Speech Signal Process. Mag., vol.5, no.2, pp.4-24, 1988.","DOI":"10.1109\/53.665"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] S.F. Boll, \u201cSuppression of acoustic noise in speech using spectral subtraction,\u201d IEEE Trans. Acoust. Speech Signal Process., vol.27, no.2, pp.113-120, 1979.","DOI":"10.1109\/TASSP.1979.1163209"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] B.S. Atal, \u201cEffectiveness of linear prediction characteristics of the speech wave for automatic speaker identification and verification,\u201d J. Acoust. Soc. Am., vol.55, no.6, pp.1304-1312, 1974.","DOI":"10.1121\/1.1914702"},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] J.-L. Gauvain and C.-H. Lee, \u201cMaximum a posteriori estimation for multivariate Gaussian mixture observations of Markov chains,\u201d IEEE Trans. Speech Audio Process. vol.2, no.2, pp.291-298, 1994.","DOI":"10.1109\/89.279278"},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] C.J. Leggette and P.C. Woodland, \u201cMaximum likelihood linear regression for speaker adaptation of continuous density hidden Markov models,\u201d Computer Speech and Language, vol.9, no.2, pp.171-185, 1995.","DOI":"10.1006\/csla.1995.0010"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] G. Potamianos and C. Neti, \u201cStream confidence estimation for audio-visual speech recognition,\u201d Proc. ICSLP2000, vol.3, pp.746-749, 2000.","DOI":"10.21437\/ICSLP.2000-643"},{"key":"7","unstructured":"[7] C. Miyajima, K. Tokuda, and T. Kitamura, \u201cAudio-visual speech recognition using MCE-based HMMs and model-dependent stream weights,\u201d Proc. ICSLP2000, vol.2, pp.1023-1026, 2000."},{"key":"8","unstructured":"[8] K. Iwano, S. Tamura, and S. 
Furui, \u201cBimodal speech recognition using lip movement measured by optical-flow analysis,\u201d Proc. HSC2001, pp.187-190, 2001."},{"key":"9","unstructured":"[9] S. Tamura, C. Miyajima, N. Kitaoka, T. Yamada, S. Tsuge, T. Takiguchi, K. Yamamoto, T. Nishiura, M. Nakayama, Y. Denda, M. Fujimoto, S. Matsuda, T. Ogawa, S. Kuroiwa, K. Takeda, and S. Nakamura, \u201cCENSREC-1-AV: An audio-visual corpus for noisy bimodal speech recognition,\u201d Proc. AVSP2010, pp.85-88, 2010."},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] N. Harte and E. Gillen, \u201cTCD-TIMIT: An audio-visual corpus of continuous speech,\u201d IEEE Trans. Multimed., vol.17, no.5, pp.603-615, 2015.","DOI":"10.1109\/TMM.2015.2407694"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] D. Burnham, \u201cBig data and resource sharing: A speech corpus and a virtual laboratory for facilitating human communication science research,\u201d Proc. Oriental COCOSDA 2014 (Keynote talk), p.10, 2014.","DOI":"10.1109\/ICSDA.2014.7051409"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] C. Bregler and Y. Konig, \u201c\u201cEigenlips\u201d for robust speech recognition,\u201d Proc. ICASSP&apos;94, vol.2, pp.669-672, 1994.","DOI":"10.1109\/ICASSP.1994.389567"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] C. Miyamoto Y. Komai, T. Takiguchi, Y. Ariki, and I. Li, \u201cMultimodal speech recognition of a person with articulation disorders using AAM and MAF,\u201d Proc. MMSP2010, pp.517-520, 2010.","DOI":"10.1109\/MMSP.2010.5662075"},{"key":"14","unstructured":"[14] Y. Lan, R. Harvey, B.-J. Theobald, E.-J. Ong, and R. Bowden, \u201cComparing visual features for lipreading,\u201d Proc. AVSP2009, pp.102-106, 2009."},{"key":"15","unstructured":"[15] S. Tamura, Y. Tagami, and S. Hayamizu., \u201cGIF-SP: GA-based informative feature for noisy speech recognition,\u201d Proc. APSIPA ASC 2012, PS.5-SLA.18.10, pp.1-4, 2012."},{"key":"16","unstructured":"[16] N. Ukai, T. Seko, S. Tamura, and S. Hayamizu, \u201cGIF-LR: GA-based informative feature for lipreading,\u201d Proc. APSIPA ASC 2012, PS.3-IVM.7.5, pp.1-4, 2012."},{"key":"17","unstructured":"[17] S. Tamura, M. Oonishi, and S. Hayamizu, \u201cAudio-visual interaction in model adaptation for multi-modal speech recognition,\u201d Proc. APSIPA ASC 2011, Thu-PM.PS2.7, pp.1-4, 2011."},{"key":"18","unstructured":"[18] S. Takeuchi, T. Hashiba, S. Tamura, and S. Hayamizu, \u201cVoice activity detection based on fusion of audio and visual information,\u201d Proc. AVSP2009, pp.151-154, 2009."},{"key":"19","unstructured":"[19] C.T. Ishi, M. Sato, N. Hagita, and S. Lao, \u201cReal-time audio-visual voice activity detection for speech recognition in noisy environments,\u201d Proc. AVSP2010, pp.81-84, 2010."},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] S. Tamura, T. Seko, and S. Hayamizu, \u201cData collection for mobile audio-visual speech recognition in various environments,\u201d Proc. Oriental COCOSDA 2014, pp.134-139, 2014.","DOI":"10.1109\/ICSDA.2014.7051434"},{"key":"21","doi-asserted-by":"crossref","unstructured":"[21] A.-R. Mohamed, G.E. Dahl, and G. Hinton, \u201cAcoustic modeling using deep belief networks,\u201d IEEE Trans. Audio Speech Language Process., vol.20, no.1, pp.14-21, 2012.","DOI":"10.1109\/TASL.2011.2109382"},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] D. Yu and M.L. Selzer, \u201cImproved bottleneck features using pretrained deep neural networks,\u201d Proc. 
INTERSPEECH2011, pp.237-240, 2011.","DOI":"10.21437\/Interspeech.2011-91"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] T. Hayashi, N. Kitaoka, and K. Takeda, \u201cInvestigating the robustness of deep bottleneck features for recognizing speech of speakers of various ages,\u201d Proc. Forum Acusticum 2014, pp.1-6, 2014.","DOI":"10.1109\/APSIPA.2014.7041556"},{"key":"24","unstructured":"[24] J. Ngiam, A. Khosla, M. Kim, and A.Y. Ng, \u201cMultimodal deep learning,\u201d Proc. ICML2011, 2011."},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] J. Huang and B. Kingsbury, \u201cAudio-visual deep learning for noise robust speech recognition,\u201d Proc. ICASSP2013, pp.7596-7599, 2013.","DOI":"10.1109\/ICASSP.2013.6639140"},{"key":"26","doi-asserted-by":"crossref","unstructured":"[26] E. Marcheret, G. Potamianos, J. Vopicka, and V. Goel, \u201cDetecting audio-visual synchrony using deep neural networks,\u201d Proc. INTERSPEECH2015, pp.548-552, 2015.","DOI":"10.21437\/Interspeech.2015-201"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] K. Noda, Y. Yamaguchi, K. Nakadai, H.G. Okuno, and T. Ogata, \u201cAudio-visual speech recognition using deep learning,\u201d Applied Intelligence, vol.42, no.4, pp.722-737, Springer, 2015.","DOI":"10.1007\/s10489-014-0629-7"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] H. Ninomiya, N. Kitaoka, S. Tamura, Y. Iribe, and K. Takeda, \u201cIntegration of deep bottleneck features for audio-visual speech recognition,\u201d Proc. INTERSPEECH2015, pp.563-566, 2015.","DOI":"10.21437\/Interspeech.2015-204"},{"key":"29","doi-asserted-by":"crossref","unstructured":"[29] S. Tamura, H. Ninomiya, N. Kitaoka, S. Osuga, Y. Iribe, K. Takeda, and S. Hayamizu, \u201cAudio-visual speech recognition using deep bottleneck features and high-performance lipreading,\u201d Proc. APSIPA ASC 2015, pp.575-582, 2015.","DOI":"10.1109\/APSIPA.2015.7415335"},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] Y. Bengio, P. Lamblin, D. Popovici, and H. Larochelle, \u201cGreedy layer-wise training of deep networks,\u201d Proc. NIPS&apos;06, pp.153-160, 2007","DOI":"10.7551\/mitpress\/7503.003.0024"},{"key":"31","doi-asserted-by":"crossref","unstructured":"[31] G.E. Hinton, S. Osindero, and Y.-W. Teh, \u201cA fast learning algorithm for deep belief nets,\u201d Neural Comput., vol.18, no.7, pp.1527-1554, 2006.","DOI":"10.1162\/neco.2006.18.7.1527"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E99.D\/10\/E99.D_2016SLP0019\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T22:12:56Z","timestamp":1749593576000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E99.D\/10\/E99.D_2016SLP0019\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"references-count":31,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2016]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2016slp0019","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"type":"print","value":"0916-8532"},{"type":"electronic","value":"1745-1361"}],"subject":[],"published":{"date-parts":[[2016]]}}}
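
The record above is a Crossref REST API "work" message ("message-type":"work"), the shape returned by https://api.crossref.org/works/{DOI}. As a minimal sketch of how such a record can be retrieved and flattened programmatically, the following Python script re-fetches it and extracts the fields shown above. It assumes network access and the third-party requests package; the field names come straight from the record, but the script itself is illustrative, not part of the deposited metadata.

# Minimal sketch: fetch this Crossref work record and flatten its key fields.
# Assumes network access and `pip install requests`; field names follow the
# JSON record above, and error handling is deliberately bare-bones.
import requests

DOI = "10.1587/transinf.2016slp0019"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the "work" payload, as in the record above

# Bibliographic core: "title" and "container-title" are single-element lists,
# and "issued" -> "date-parts" nests [[year, month, day]] (here just [[2016]]).
title = work["title"][0]
journal = work["container-title"][0]
year = work["issued"]["date-parts"][0][0]
authors = [f'{a["given"]} {a["family"]}' for a in work["author"]]

# References: every entry carries "key" and "unstructured"; "DOI" is optional
# (in this record, conference papers without registered DOIs omit it).
cited_dois = [r["DOI"] for r in work.get("reference", []) if "DOI" in r]

print(f"{title} ({journal}, {year})")
print("Authors:", ", ".join(authors))
print(f'{work["references-count"]} references, {len(cited_dois)} with DOIs')
print(f'Cited by {work["is-referenced-by-count"]} works')

If you query the API regularly, Crossref asks that you identify yourself (for example via a mailto= query parameter) so your requests are routed to its polite pool.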