{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,3]],"date-time":"2025-06-03T22:40:03Z","timestamp":1748990403065,"version":"3.41.0"},"publisher-location":"Cham","reference-count":19,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319395159"},{"type":"electronic","value":"9783319395166"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-39516-6_16","type":"book-chapter","created":{"date-parts":[[2016,6,18]],"date-time":"2016-06-18T06:08:38Z","timestamp":1466230118000},"page":"170-179","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Bimodal Speech Recognition Fusing Audio-Visual Modalities"],"prefix":"10.1007","author":[{"given":"Alexey","family":"Karpov","sequence":"first","affiliation":[]},{"given":"Alexander","family":"Ronzhin","sequence":"additional","affiliation":[]},{"given":"Irina","family":"Kipyatkova","sequence":"additional","affiliation":[]},{"given":"Andrey","family":"Ronzhin","sequence":"additional","affiliation":[]},{"given":"Vasilisa","family":"Verkhodanova","sequence":"additional","affiliation":[]},{"given":"Anton","family":"Saveliev","sequence":"additional","affiliation":[]},{"given":"Milos","family":"Zelezny","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,6,19]]},"reference":[{"issue":"1","key":"16_CR1","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1016\/j.specom.2013.07.004","volume":"56","author":"A Karpov","year":"2014","unstructured":"Karpov, A., Markov, K., Kipyatkova, I., Vazhenina, D., Ronzhin, A.: Large vocabulary Russian speech recognition using syntactico-statistical language modeling. Speech Commun. 56(1), 213\u2013228 (2014)","journal-title":"Speech Commun."},{"issue":"1","key":"16_CR2","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1016\/j.specom.2013.07.008","volume":"56","author":"L Besacier","year":"2014","unstructured":"Besacier, L., Barnard, E., Karpov, A., Schultz, T.: Automatic speech recognition for under-resourced languages: A survey. Speech Commun. 56(1), 85\u2013100 (2014)","journal-title":"Speech Commun."},{"issue":"9","key":"16_CR3","doi-asserted-by":"publisher","first-page":"1635","DOI":"10.1109\/JPROC.2015.2459017","volume":"103","author":"A Katsaggelos","year":"2015","unstructured":"Katsaggelos, A., Bahaadini, S., Molina, R.: Audio-visual fusion: challenges and new technologies. Proc. IEEE 103(9), 1635\u20131653 (2015)","journal-title":"Proc. IEEE"},{"issue":"2","key":"16_CR4","doi-asserted-by":"publisher","first-page":"175","DOI":"10.1109\/TCYB.2013.2250954","volume":"44","author":"D Stewart","year":"2014","unstructured":"Stewart, D., Seymour, R., Pass, A., Ming, J.: Robust audio-visual speech recognition under noisy audio-video conditions. IEEE Trans. Cybern. 44(2), 175\u2013184 (2014)","journal-title":"IEEE Trans. Cybern."},{"key":"16_CR5","doi-asserted-by":"publisher","first-page":"722","DOI":"10.1007\/s10489-014-0629-7","volume":"42","author":"K Noda","year":"2015","unstructured":"Noda, K., Yamaguchi, Y., Nakadai, K., Okuno, H., Ogata, T.: Audio-visual speech recognition using deep learning. Appl. Intell. 42, 722\u2013737 (2015)","journal-title":"Appl. Intell."},{"issue":"5","key":"16_CR6","doi-asserted-by":"publisher","first-page":"1060","DOI":"10.1109\/TASL.2013.2244083","volume":"21","author":"L Deng","year":"2013","unstructured":"Deng, L., Li, X.: Machine learning paradigms for speech recognition: an overview. IEEE Transa. Audio Speech Lang. Process. 21(5), 1060\u20131089 (2013)","journal-title":"IEEE Transa. Audio Speech Lang. Process."},{"key":"16_CR7","volume-title":"The HTK Book (for HTK Version 3.4)","author":"S Young","year":"2006","unstructured":"Young, S., et al.: The HTK Book (for HTK Version 3.4). Cambridge University Press, Cambridge (2006)"},{"key":"16_CR8","volume-title":"Learning OpenCV 3","author":"A Kaehler","year":"2015","unstructured":"Kaehler, A., Bradsky, G.: Learning OpenCV 3. O\u2019Reilly Media, California (2015)"},{"doi-asserted-by":"crossref","unstructured":"Viola, P., Jones, M.: Rapid object detection using a boosted cascade of simple features. In: Proceedings of the IEEE International Conference on Computer Vision and Pattern Recognition CVPR-2001, USA, pp. 511\u2013518 (2001)","key":"16_CR9","DOI":"10.1109\/CVPR.2001.990517"},{"unstructured":"Liang, L., Liu, X., Zhao, Y., Pi, X., Nefian, A.: Speaker independent audio-visual continuous speech recognition. In: Proceedings of the International Conferenceon Multimedia and Expo ICME 2002, Lausanne, Switzerland, pp. 25\u201328 (2002)","key":"16_CR10"},{"issue":"3","key":"16_CR11","first-page":"481","volume":"22","author":"M Castrillyn","year":"2011","unstructured":"Castrillyn, M., Deniz, O., Hernandez, D., Lorenzo, J.: A comparison of face and facial feature detectors based on the Viola-Jones general object detection framework. Mach. Vis. Appl. 22(3), 481\u2013494 (2011)","journal-title":"Mach. Vis. Appl."},{"doi-asserted-by":"crossref","unstructured":"Nefian, A.V., Liang, L.H., Pi, X., Xiaoxiang, X., Mao, C., Murphy, K.: A coupled HMM for audio-visual speech recognition. In: Proceedings of the International Conference ICASSP-2002, Orlando, USA, pp. 2013\u20132016 (2002)","key":"16_CR12","DOI":"10.1109\/ICASSP.2002.5745027"},{"issue":"12","key":"16_CR13","doi-asserted-by":"publisher","first-page":"2190","DOI":"10.1134\/S000511791412008X","volume":"75","author":"AA Karpov","year":"2014","unstructured":"Karpov, A.A.: An automatic multimodal speech recognition system with audio and video information. Autom. Remote Control 75(12), 2190\u20132200 (2014)","journal-title":"Autom. Remote Control"},{"key":"16_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1007\/978-3-319-11581-8_6","volume-title":"Speech and Computer","author":"A Karpov","year":"2014","unstructured":"Karpov, A., Kipyatkova, I., \u017delezn\u00fd, M.: A framework for recording audio-visual speech corpora with a microphone and a high-speed camera. In: Ronzhin, A., Potapova, R., Delic, V. (eds.) SPECOM 2014. LNCS, vol. 8773, pp. 50\u201357. Springer, Heidelberg (2014)"},{"unstructured":"Chu, S.M., Huang, T.S.: Multi-modal sensory fusion with application to audio-visual speech recognition. In: Proceedings of the Multi-Modal Speech Recognition Workshop-2002, Greensboro, USA (2002)","key":"16_CR15"},{"doi-asserted-by":"crossref","unstructured":"Karpov, A., Ronzhin, A., Markov, K., Zelezny, M.: Viseme-dependent weight optimization for CHMM-based audio-visual speech recognition. In: Proceedings of the International Conference, INTERSPEECH-2010, ISCA Association, Makuhari, Japan, pp. 2678\u20132681 (2010)","key":"16_CR16","DOI":"10.21437\/Interspeech.2010-710"},{"key":"16_CR17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-49127-9","volume-title":"Springer Handbook of Speech Processing","author":"J Benesty","year":"2008","unstructured":"Benesty, J., Sondhi, M., Huang, Y., et al.: Springer Handbook of Speech Processing. Springer, New York (2008)"},{"key":"16_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"369","DOI":"10.1007\/978-3-319-07437-5_35","volume-title":"Universal Access in Human-Computer Interaction","author":"A Karpov","year":"2014","unstructured":"Karpov, A., Ronzhin, A.: A universal assistive technology with multimodal input and multimedia output interfaces. In: Stephanidis, C., Antona, M. (eds.) UAHCI 2014, Part I. LNCS, vol. 8513, pp. 369\u2013378. Springer, Heidelberg (2014)"},{"key":"16_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1007\/978-3-319-20681-3_43","volume-title":"Universal Access in Human-Computer Interaction. Access to Interaction","author":"A Karpov","year":"2015","unstructured":"Karpov, A., Ronzhin, A., Kipyatkova, I.: Automatic analysis of speech and acoustic events for ambient assisted living. In: Antona, M., Stephanidis, C. (eds.) UAHCI 2015. LNCS, vol. 9176, pp. 455\u2013463. Springer, Heidelberg (2015)"}],"container-title":["Lecture Notes in Computer Science","Human-Computer Interaction. Interaction Platforms and Techniques"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-39516-6_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,3]],"date-time":"2025-06-03T22:08:53Z","timestamp":1748988533000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-39516-6_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319395159","9783319395166"],"references-count":19,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-39516-6_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"19 June 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}