{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:20:40Z","timestamp":1777656040778,"version":"3.51.4"},"reference-count":134,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2003,9,1]],"date-time":"2003-09-01T00:00:00Z","timestamp":1062374400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Proc. IEEE"],"published-print":{"date-parts":[[2003,9]]},"DOI":"10.1109\/jproc.2003.817150","type":"journal-article","created":{"date-parts":[[2003,9,11]],"date-time":"2003-09-11T19:12:07Z","timestamp":1063307527000},"page":"1306-1326","source":"Crossref","is-referenced-by-count":458,"title":["Recent advances in the automatic recognition of audiovisual speech"],"prefix":"10.1109","volume":"91","author":[{"given":"G.","family":"Pomianos","sequence":"first","affiliation":[]},{"given":"C.","family":"Neti","sequence":"additional","affiliation":[]},{"given":"G.","family":"Gravier","sequence":"additional","affiliation":[]},{"given":"A.","family":"Garg","sequence":"additional","affiliation":[]},{"given":"A.W.","family":"Senior","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(97)00021-6"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/0885-2308(91)90011-E"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4613-1367-0_15"},{"key":"ref4","volume-title":"Springer-Verlag","author":"Stork","year":"1996"},{"key":"ref5","volume-title":"Psychology","author":"Campbell","year":"1998"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1121\/1.1907309"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1038\/264746a0"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.4324\/9780203098752-24"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.4324\/9780203098752-21"},{"key":"ref10","first-page":"3","article-title":"Some preliminaries to a comprehensive account of audio-visual speech perception","volume-title":"Hearing by Eye: The Psychology of Lip-Reading","author":"Summerfield","year":"1987"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1511\/1998.25.861"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(98)00048-X"},{"key":"ref13","first-page":"112","article-title":"Estimation of speech acoustics from visual speech features: A comparison of linear and nonlinear models","volume-title":"Proc. Conf. Audio-Visual Speech Processing","author":"Barker"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206046"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/b978-0-444-87143-5.50019-6"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-13015-5_1"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-13015-5_25"},{"key":"ref18","first-page":"265","article-title":"Automatic lipreading to enhance speech recognition","volume-title":"Proc. Global Telecommunications Conf.","author":"Petajan"},{"key":"ref19","volume-title":"Fundamentals of Speech Recognition","author":"Rabiner","year":"1993"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/6046.985551"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-13015-5_35"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICSLP.1996.607020"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-13015-5_36"},{"key":"ref24","first-page":"53","article-title":"Combining noise compensation with visual information in speech recognition","volume-title":"Proc. Eur. Workshop Audio-Visual Speech Processing","author":"Cox"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.1998.738914"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/6046.865479"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.940796"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2001.962800"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206083"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.679695"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2000.871546"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-13015-5_39"},{"key":"ref33","first-page":"61","article-title":"Adaptive determination of audio and visual weights for automatic speech recognition","volume-title":"Proc. Eur. Workshop Audio-Visual Speech Processing","author":"Rogozan"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1994.389567"},{"key":"ref35","first-page":"57","article-title":"Neural architectures for sensorfusion in speech recognition","volume-title":"Proc. Eur. Workshop Audio-Visual Speech Processing","author":"Krone"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/icslp.2000-468"},{"key":"ref37","article-title":"Audio-visual speech recognition","author":"Neti","year":"2000","journal-title":"Center Lang. Speech Process., Johns Hopkins Univ. Baltimore, MD"},{"key":"ref38","first-page":"177","article-title":"Automatic speechreading of impaired speech","volume-title":"Proc. Conf. Audio-Visual Speech Processing","author":"Potamianos"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.1994-139"},{"key":"ref40","first-page":"173","article-title":"An image transform approach for HMM based automatic lipreading","volume-title":"Proc. Int. Conf. Image Processing","volume":"1","author":"Potamianos"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/83.605417"},{"key":"ref42","first-page":"30","article-title":"Lipreading using eigensequences","volume-title":"Proc. Int. Workshop Automatic Face Gesture Recognition","author":"Li"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2001.962802"},{"key":"ref44","doi-asserted-by":"crossref","first-page":"193","DOI":"10.1023\/A:1011352422845","article-title":"A cascade visual front end for speaker independent automatic speechreading","volume":"4","author":"Potamianos","year":"2001","journal-title":"Int. J. Speech Technol."},{"key":"ref45","first-page":"69","article-title":"Word dependent acoustic-labial weights in HMM-based speech recognition","volume-title":"Proc. Eur. Workshop Audio-Visual Speech Processing","author":"Jourlin"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/89.799688"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206150"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2000-759"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1006\/cviu.1996.0570"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICSLP.1996.607022"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-13015-5_27"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206162"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2001.962703"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0054760"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.1997-476"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/BF00133570"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/BF00127169"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1006\/cviu.1995.1004"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICSMC.1997.635160"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206137"},{"key":"ref61","first-page":"1202","article-title":"Automatically building and evaluating statistical models for lipreading","volume":"2002","author":"Daubias","year":"2002","journal-title":"EURASIP J. Appl. Signal Processing"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/34.655647"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/34.655648"},{"key":"ref64","first-page":"154","article-title":"Face and feature finding for a face recognition system","volume-title":"Proc. Int. Conf. Audio Video-based Biometric Person Authentication","author":"Senior"},{"key":"ref65","volume-title":"Linear Statistical Inference and its Applications","author":"Rao","year":"1965"},{"key":"ref66","volume-title":"Introduction to Multivariate Analysis","author":"Chatfield","year":"1991"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2001.958098"},{"key":"ref68","volume-title":"Digital Image Processing","author":"Gonzalez","year":"1977"},{"key":"ref69","volume-title":"Wavelets","author":"Daubechies","year":"1992"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1996.543247"},{"key":"ref71","first-page":"751","article-title":"Dynamic features for visual speech-reading: A systematic comparison","volume-title":"Advances in Neural Information Processing Systems","volume":"9","author":"Gray","year":"1997"},{"key":"ref72","volume-title":"The HTK Book","author":"Young","year":"1999"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.4324\/9780203098752-10"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.675351"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.675411"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1996.543250"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.21437\/icslp.1998-270"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/79.911195"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702207039"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2002.5745026"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206101"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1142\/S021821309900004X"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/89.536928"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.2307\/2984875"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1986.1169179"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1142\/S0218001494000024"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.1995-381"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1121\/1.1358887"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2002.5745030"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2002-421"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/21.155943"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/34.667881"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/34.824819"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1007\/0-306-47044-6_1"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.1996-90"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2000-83"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.21437\/icslp.2000-446"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.674472"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.940795"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1990.115970"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.1997.609450"},{"key":"ref102","first-page":"132","article-title":"Speech intelligibility derived from asynchronous processing of auditory-visual information","volume-title":"Proc. Conf. Audio-Visual Speech Processing","author":"Grant"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2002-123"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.3115\/1289189.1289244"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45344-X_20"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2002.5743873"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1093\/comjnl\/7.4.308"},{"key":"ref108","first-page":"711","article-title":"A new SNR-feature mapping for robust multistream speech recognition","volume-title":"Proc. Int. Congress Phonetic Sciences","author":"Berthommier"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2000-643"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/89.279278"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1006\/csla.1995.0010"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.1995-282"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1997.596119"},{"key":"ref114","article-title":"Maximum likelihood multiple projection schemes for hidden Markov models","author":"Gales","year":"1999","journal-title":"Cambridge University Cambridge, U.K."},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.1999-303"},{"key":"ref116","doi-asserted-by":"crossref","first-page":"403","DOI":"10.1007\/BFb0016021","article-title":"The M2VTS multimodal face database","volume-title":"Audio-and Video-based Biometric Person Authentication","author":"Pigeon","year":"1997"},{"key":"ref117","first-page":"72","article-title":"XM2VTS: The extended M2VTS database","volume-title":"Proc. Int. Conf. Audio Video-based Biometric Person Authentication","author":"Messer"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2001.962704"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/89.221368"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/5.664274"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.679685"},{"key":"ref122","first-page":"55","article-title":"Multi-sensor biometric person recognition in an access control system","volume-title":"Proc. Int. Conf. Audio Video-based Biometric Person Authentication","author":"Fr\u00f6ba"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.1999.793814"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ACSSC.1994.471516"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/97.376913"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2000.871439"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/6046.865480"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/TNN.2002.1021891"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2000.859318"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.1999.793797"},{"key":"ref131","first-page":"67","article-title":"Audiovisual speech coder: Using vector quantization to exploit the audio\/video correlation","volume-title":"Proc. Conf. Audio-Visual Speech Processing","author":"Foucher"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702207015"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1995.479827"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1155\/s1110865702206058"}],"container-title":["Proceedings of the IEEE"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx5\/5\/27570\/01230212.pdf?arnumber=1230212","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T01:58:54Z","timestamp":1743040734000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/1230212\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2003,9]]},"references-count":134,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2003,9]]}},"URL":"https:\/\/doi.org\/10.1109\/jproc.2003.817150","relation":{},"ISSN":["0018-9219"],"issn-type":[{"value":"0018-9219","type":"print"}],"subject":[],"published":{"date-parts":[[2003,9]]}}}