{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T10:13:29Z","timestamp":1760955209332,"version":"3.38.0"},"publisher-location":"Berlin, Heidelberg","reference-count":22,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783540229452"},{"type":"electronic","value":"9783540286493"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2004]]},"DOI":"10.1007\/978-3-540-28649-3_60","type":"book-chapter","created":{"date-parts":[[2010,9,8]],"date-time":"2010-09-08T19:31:41Z","timestamp":1283974301000},"page":"488-495","source":"Crossref","is-referenced-by-count":6,"title":["Large Vocabulary Audio-Visual Speech Recognition Using the Janus Speech Recognition Toolkit"],"prefix":"10.1007","author":[{"given":"Jan","family":"Kratt","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Florian","family":"Metze","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rainer","family":"Stiefelhagen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alex","family":"Waibel","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","reference":[{"key":"60_CR1","doi-asserted-by":"crossref","unstructured":"McGurk, H., MacDonald, J.: Hearing lips and seeing voices. Nature (1976)","DOI":"10.1038\/264746a0"},{"key":"60_CR2","unstructured":"Potamianos, G., Neti, C., Deligne, S.: Joint Audio-Visual Speech Processing for Recognition and Enhancement. In: Proceedings of AVSP 2003 (2003)"},{"key":"60_CR3","doi-asserted-by":"crossref","unstructured":"Goecke, R., Potamianos, G., Neti, C.: Noisy Audio Feature Enhancement using Audio-Visual Speech Data. In: ICASSP 2002 (2002)","DOI":"10.1109\/ICASSP.2002.5745030"},{"key":"60_CR4","doi-asserted-by":"crossref","unstructured":"Hennecke, M.E., Prasad, K.V., Stork, D.G.: Using deformable templates to infer visual speech dynamics. In: 28th Annual Asimolar conference on Signal speech and Computers","DOI":"10.1109\/ACSSC.1994.471518"},{"key":"60_CR5","doi-asserted-by":"crossref","unstructured":"Goldschen, A.J., Gracia, O.N., Petajan, E.: Continuous optical automatic speech recognition by lipreading. In: 28th Annual Asimolar conference on Signal speech and Computers","DOI":"10.1109\/ACSSC.1994.471517"},{"key":"60_CR6","unstructured":"Movellan, J.R.: Visual speech recognition with stochastic networks. In: NIPS 1994 (1994)"},{"key":"60_CR7","doi-asserted-by":"crossref","unstructured":"Duchnowski, P., Meier, U., Waibel, A.: See me, hear me: Integrating automatic speech recognition and lip-reading. In: Internation Conference on Spoken Language Processing, ICSLP, pp. 547\u2013550 (1994)","DOI":"10.21437\/ICSLP.1994-139"},{"key":"60_CR8","doi-asserted-by":"crossref","unstructured":"Deligne, S., Potamianos, G., Neti, C.: Audio-Visual speech enhancement with avcdcn (Audio-Visual Codebook Dependent Cepstral Normalization). In: IEEE workshop on Sensor Array and Multichannel Signal Processing in August 2002, Washington DC and ICSLP (2002)","DOI":"10.21437\/ICSLP.2002-421"},{"key":"60_CR9","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1109\/6046.865479","volume":"2","author":"S. Dupont","year":"2000","unstructured":"Dupont, S., Luettin, J.: Audio-visual speech modeling for continuous speech recognition. IEEE Trans. Multimedia\u00a02, 141\u2013151 (2000)","journal-title":"IEEE Trans. Multimedia"},{"key":"60_CR10","doi-asserted-by":"crossref","unstructured":"Huang, J., Potamianos, G., Neti, C.: Improving Audio-Visual Speech Recognition with an Infrared Headset. In: Proceedings of AVSP 2003 (2003)","DOI":"10.21437\/Eurospeech.2003-410"},{"key":"#cr-split#-60_CR11.1","doi-asserted-by":"crossref","unstructured":"Meier, U., Stiefelhagen, R., Yang, J., Waibel, A.: Towards Unrestricted Lipreading. International Journal of pattern Recognition and Artificial Intelligence\u00a014(5), 571\u2013785 (2000);","DOI":"10.1142\/S0218001400000374"},{"key":"#cr-split#-60_CR11.2","unstructured":"Second International Conference on Multimodal Interfaces, ICMI 1999 (1999)"},{"key":"60_CR12","doi-asserted-by":"crossref","unstructured":"Bregler, C., Konig, Y.: Eigenlips for robust speech recognition. In: Proc. IEEE Intl. Conf. Acous. Speech Sig. Process, pp. 669\u2013672 (1994)","DOI":"10.1109\/ICASSP.1994.389567"},{"key":"60_CR13","doi-asserted-by":"crossref","unstructured":"Matthews, I., Bangham, J.A., Cox, S.: Audiovisual speech recognition using multiscale nonlinear image decomposition. In: Proc. 4th ICSLP, vol.\u00a01, pp. 38\u201341 (1996)","DOI":"10.1109\/ICSLP.1996.607019"},{"issue":"8","key":"60_CR14","first-page":"1417","volume":"E80A","author":"A. Ogihara","year":"1997","unstructured":"Ogihara, A., Asao, S.: An isolated word speech recognition based on fusion of visual and auditory information using 30-frames\/s and 24-bit color image. IEICE Trans. Fund. Electron., Commun. Comput. Sci.\u00a0E80A(8), 1417\u20131422 (1997)","journal-title":"IEICE Trans. Fund. Electron., Commun. Comput. Sci."},{"key":"60_CR15","volume-title":"Audio-Visual Speech Recognition - Workshop 2000 Final Report. Center for Language and Speech Processing","author":"C. Neti","year":"2000","unstructured":"Neti, C., Potamianos, G., et al.: Audio-Visual Speech Recognition - Workshop 2000 Final Report. Center for Language and Speech Processing. The Johns Hopkins University, Baltimore (2000)"},{"key":"60_CR16","doi-asserted-by":"crossref","unstructured":"Potamianos, G., Neti, C., Iyengar, G., Helmuth, E.: Large-Vocabulary Audio-Visual Speech Recognition by Machines and Humans. In: Proc. Eurospeech (2001)","DOI":"10.21437\/Eurospeech.2001-294"},{"key":"60_CR17","doi-asserted-by":"crossref","unstructured":"Potamianos, G., Verma, A., Neti, C., Iyengar, G., Basu, S.: A Cascade Image Transformation For Speaker Independent Automatic Speechreading. In: Proceedings of the IEEE International Conference on Multimedia and Expo, pp. 1097\u20131100 (2000)","DOI":"10.1109\/ICME.2000.871552"},{"key":"60_CR18","unstructured":"Finke, M., Geutner, P., Hild, H., Kemp, T., Ries, K., Westphal, M.: The Karlsruhe- VERBMOBIL Speech Recognition Engine. In: Proceedings of ICASSP, Munich, Germany (1997)"},{"key":"60_CR19","doi-asserted-by":"crossref","unstructured":"Soltau, H., Metze, F., F\u00fcgen, C., Waibel, A.: A One Pass-Decoder Based on Polymorphic Linguistic Context Assignment. In: Proc. of ASRU, Trento, Italy (2001)","DOI":"10.1109\/ASRU.2001.1034625"},{"key":"60_CR20","doi-asserted-by":"crossref","unstructured":"Stiefelhagen, R., Yang, J.: Gaze Tracking for Multimodal Human- Computer Interaction. In: Proc. of the International Conference on Acoustics, Speech and Signal Processing: ICASSP 1997, Munich, Germany (April 1997)","DOI":"10.1109\/ICASSP.1997.595325"},{"key":"60_CR21","doi-asserted-by":"crossref","unstructured":"Gravier, G., Potamianos, G., Neti, C.: Asynchrony modeling for audio-visual speech recognition. In: Proc. Human Language Technology Conference (2002)","DOI":"10.3115\/1289189.1289244"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-540-28649-3_60.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T17:09:21Z","timestamp":1740503361000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-540-28649-3_60"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2004]]},"ISBN":["9783540229452","9783540286493"],"references-count":22,"URL":"https:\/\/doi.org\/10.1007\/978-3-540-28649-3_60","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2004]]}}}