{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T15:55:43Z","timestamp":1774540543832,"version":"3.50.1"},"reference-count":141,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001261","name":"Chinasoft International Ltd., and London South Bank University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001261","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/access.2021.3107946","type":"journal-article","created":{"date-parts":[[2021,8,25]],"date-time":"2021-08-25T19:56:46Z","timestamp":1629921406000},"page":"121184-121205","source":"Crossref","is-referenced-by-count":62,"title":["Deep Learning-Based Automated Lip-Reading: A Survey"],"prefix":"10.1109","volume":"9","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6725-0405","authenticated-orcid":false,"given":"Souheil","family":"Fenghour","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0030-1199","authenticated-orcid":false,"given":"Daqing","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1436-1742","authenticated-orcid":false,"given":"Kun","family":"Guo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1415-4444","authenticated-orcid":false,"given":"Bo","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9036-3061","authenticated-orcid":false,"given":"Perry","family":"Xiao","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","article-title":"LRS3-TED: A large-scale dataset for visual speech recognition","author":"afouras","year":"2018","journal-title":"arXiv 1809 00496"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178347"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.2991\/jcis.2008.61"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-43958-7_40"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1044\/jshr.1404.677"},{"key":"ref37","first-page":"1","article-title":"Improving visual features for lip-reading","author":"lan","year":"2010","journal-title":"Proc Int Conf Auditory-Visual Speech Process"},{"key":"ref36","first-page":"1","article-title":"The IV2 multimodal biometric database (including iris, 2D, 3D, stereoscopic, and talking face data), and the IV2-2007 evaluation campaign","author":"petrovska-delacretaz","year":"2008","journal-title":"Proc IEEE 2nd Int Conf Biometrics Theory Appl Syst"},{"key":"ref35","first-page":"69","article-title":"Patch-based analysis of visual speech from multiple views","author":"lucey","year":"2008","journal-title":"Proc Int Conf Auditory-Visual Speech Process"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2004.10.007"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"ref27","article-title":"BT David database-internal rep","author":"chibelushi","year":"1996"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1121\/1.5042758"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2019.12.122"},{"key":"ref22","article-title":"BL-database: A French audiovisual database for speech driven lip animation systems INRIA","author":"benezeth","year":"2011"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44887-X_74"},{"key":"ref24","first-page":"85","article-title":"CENSREC-1-AV: An audio-visual corpus for noisy bimodal speech recognition","author":"tamura","year":"2010","journal-title":"Proc Int Conf Auditory-Visual Speech Process"},{"key":"ref23","first-page":"42","article-title":"Chinese audiovisual bimodal speeeh database CAVSR1.0","volume":"25","author":"yanjun","year":"2000","journal-title":"Acta Acustica-Peking"},{"key":"ref101","author":"graves","year":"2006","journal-title":"Connectionist temporal classification labelling unsegmented sequence data with recurrent neural networks"},{"key":"ref26","first-page":"ii-2017","article-title":"CUAVE: A new audio-visual database for multimodal human-computer interface research","author":"patterson","year":"2002","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2007.366941"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2015.7163155"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2010.5650963"},{"key":"ref59","article-title":"The VidTIMIT database","author":"sanderson","year":"2002"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/11527923_81"},{"key":"ref57","first-page":"1239","article-title":"Design and recording of Czech audio-visual database with impaired conditions for continuous speech recognition","author":"trojanova","year":"2008","journal-title":"Proc Int Conf Lang Resour Eval"},{"key":"ref56","first-page":"1","article-title":"Design and recording of Czech speech corpus for audio-visual continuous speech recognition","author":"cisar","year":"2005","journal-title":"Proc Auditory-Visual Speech Process Int Conf"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2011.06.011"},{"key":"ref54","first-page":"851","article-title":"Visual speech recognition with stochastic networks","author":"movellan","year":"1994","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2407694"},{"key":"ref52","article-title":"Confusion modelling for lip-reading, University of East Anglia","author":"howell","year":"2015"},{"key":"ref40","first-page":"87","article-title":"Lip reading in the wild","author":"chung","year":"2015","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref4","first-page":"1","article-title":"Comparison of phoneme and viseme based acoustic units for speech driven realistic lip animation","author":"bozkurt","year":"2007","journal-title":"Proc 3DTV Conf"},{"key":"ref3","article-title":"Audio visual speech recognition","author":"neti","year":"2000"},{"key":"ref6","first-page":"1","article-title":"Multimodal deep learning","author":"ngiam","year":"2011","journal-title":"Proc 28th Int Conf Mach Learn (ICML)"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45683-X_60"},{"key":"ref8","article-title":"Large-scale visual speech recognition","author":"shillingford","year":"2018","journal-title":"arXiv 1807 05162"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2009.2030637"},{"key":"ref7","first-page":"873","article-title":"Sparse deep belief net model for visual area V2","author":"lee","year":"2008","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3040906"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s10844-016-0438-z"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2012.116"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15760-8_33"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.155"},{"key":"ref42","first-page":"1065","article-title":"Multipose audio-visual speech recognition","author":"estellers","year":"2011","journal-title":"Proc 19th Eur Signal Process Conf"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756582"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-11755-3_3"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.1999.822914"},{"key":"ref127","first-page":"420","article-title":"Mutual information maximization for effective lip reading","author":"zhao","year":"2020","journal-title":"Proc 15th IEEE Int Conf Autom Face Gesture Recognit (FG)"},{"key":"ref126","first-page":"273","article-title":"Pseudo-convolutional policy gradient for sequence-to-sequence lip-reading","author":"luo","year":"2020","journal-title":"Proc 15th IEEE Int Conf Autom Face Gesture Recognit (FG)"},{"key":"ref125","first-page":"364","article-title":"Deformation flow based two-stream network for lip reading","author":"xiao","year":"2020","journal-title":"Proc 15th IEEE Int Conf Autom Face Gesture Recognit (FG)"},{"key":"ref124","first-page":"356","article-title":"Can we read speech beyond the lips? Rethinking RoI selection for deep visual speech recognition","author":"zhang","year":"2020","journal-title":"Proc 15th IEEE Int Conf Autom Face Gesture Recognit (FG)"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS.2007.4425340"},{"key":"ref72","first-page":"1385","article-title":"Naive Bayes face\/nonface classifier: A study of preprocessing and feature extraction techniques","author":"phung","year":"2004","journal-title":"Proc Int Conf Image Process (ICIP)"},{"key":"ref129","first-page":"7608","article-title":"Towards practical lipreading with distilled and efficient models","author":"ma","year":"2021","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process (ICASSP)"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2014.06.004"},{"key":"ref128","first-page":"2857","article-title":"Lip-reading with densely connected temporal convolutional networks","author":"ma","year":"2021","journal-title":"Proc IEEE Winter Conf Appl Comput Vis (WACV)"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/6046.865479"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-020-01813-1"},{"key":"ref130","first-page":"1030","article-title":"Complementary sum sampling for likelihood approximation in large scale classification","volume":"54","author":"botev","year":"2017","journal-title":"Proc 20th Int Conf Artif Intell Statist"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952625"},{"key":"ref74","first-page":"1","article-title":"A review on face detection methods","volume":"11","author":"rizvi","year":"2011","journal-title":"J Manage Develop Inf Technol"},{"key":"ref75","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1109\/TENCON.1999.818355","article-title":"Face detection and recognition using PCA","volume":"1","author":"lee","year":"1999","journal-title":"Proc IEEE Region 10 Conf TENCON Multimedia Technol Asia&#x2013;Pacific Inf Infrastruct"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1007\/978-94-015-8935-2_14"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2016.03.003"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.21437\/AVSP.2017-8"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1016\/0010-0277(93)90058-4"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.161"},{"key":"ref136","first-page":"1","article-title":"Comparing phonemes and visemes with DNN-based lipreading","author":"thangthai","year":"2017","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-106"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.389"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.23"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2017.34"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2008.2011515"},{"key":"ref62","first-page":"965","article-title":"XM2VTSDB: The extended M2VTS database","author":"messer","year":"1999","journal-title":"Proc Int Conf Audio Video-Based Biometric Person Authentication"},{"key":"ref61","first-page":"1","article-title":"WAPUSK20&#x2014;A database for robust audiovisual speech recognition","author":"wang","year":"2010","journal-title":"Proc Int Conf Lang Resour Eval"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.3390\/app9081599"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/s11760-019-01630-1"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/PRIA.2017.7983045"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1080\/02699200500266745"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2520091"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(90)90010-7"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2004-433"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-76316-3_4"},{"key":"ref2","author":"jeffers","year":"1971","journal-title":"Speechreading (Lipreading)"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2012.192"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1027933.1027972"},{"key":"ref109","first-page":"127","article-title":"Improving lip-reading performance for robust audiovisual speech recognition using DNNs","author":"thangthai","year":"2015","journal-title":"Proc Int Conf Auditory-Visual Speech Process"},{"key":"ref95","first-page":"2377","article-title":"Training very deep networks","author":"srivastava","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref108","article-title":"Multimodal transfer deep learning with applications in audio-visual recognition","author":"moon","year":"2015","journal-title":"Proc MMML Workshop Neural Inf Process Syst"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00088"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639140"},{"key":"ref93","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2014","journal-title":"arXiv 1409 0473"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2017.2761539"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053841"},{"key":"ref91","first-page":"1","article-title":"LipNet: End-to-end sentence level lipreading","author":"assael","year":"2016","journal-title":"Proc ICLR Conf"},{"key":"ref104","first-page":"7613","article-title":"End-To-end audio-visual speech recognition with conformers","author":"ma","year":"2021","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process (ICASSP)"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-51369-6_34"},{"key":"ref103","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc NIPS"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1943"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472088"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472852"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472172"},{"key":"ref98","article-title":"LipReading with 3D-2D-CNN BLSTM-HMM and word-CTC models","author":"kumar margam","year":"2019","journal-title":"arXiv 1906 12170"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-85"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461347"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2018.07.002"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3036865"},{"key":"ref12","first-page":"163","article-title":"Audiovisual database of Polish speech recordings","volume":"33","author":"igras","year":"2012","journal-title":"Studia Informatica"},{"key":"ref13","first-page":"1","article-title":"AusTalk: An audio-visual corpus of Australian English","author":"estival","year":"2014","journal-title":"Proc Int Conf Lang Resour Eval"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461596"},{"key":"ref15","first-page":"1","article-title":"AV@CAR: A Spanish multichannel multimodal corpus for in-vehicle automatic audio-visual speech recognition","author":"ortega","year":"2004","journal-title":"Proc Int Conf Lang Resour Eval"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00289"},{"key":"ref16","first-page":"7","article-title":"Audio visual Arabic speech (AVAS) database for human-computer interaction applications","volume":"3","author":"antar","year":"2013","journal-title":"Int J Adv Res Comput Sci Softw Eng"},{"key":"ref82","article-title":"Lip reading using CNN and LSTM","author":"garg","year":"2016"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2018.8486470"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2004-424"},{"key":"ref81","first-page":"1149","article-title":"Lipreading using convolutional neural network","author":"noda","year":"2014","journal-title":"Proc Conf Int Speech Commun Assoc"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/34.982900"},{"key":"ref84","first-page":"251","article-title":"Out of time: Automated lip sync in the wild","author":"chung","year":"2016","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref119","article-title":"Learning from videos with deep convolutional LSTM networks","author":"courtney","year":"2019","journal-title":"arXiv 1904 04817"},{"key":"ref19","first-page":"179","article-title":"The challenge of multispeaker lip-reading","author":"cox","year":"2008","journal-title":"Proc Int Conf Auditory-Visual Speech Process"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICIS.2016.7550888"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/IC3.2018.8530509"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-421"},{"key":"ref116","article-title":"Deep audio-visual speech recognition","author":"afouras","year":"2018","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-21735-7_7"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461900"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2927166"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019211"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683733"},{"key":"ref122","first-page":"1","article-title":"Learning spatio-temporal features with two-stream deep 3D CNNs for lipreading","author":"weng","year":"2019","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref123","first-page":"1","article-title":"Multi-grained spatio-temporal modelling for lip-reading","author":"wang","year":"2019","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref85","first-page":"290","article-title":"Multi-view automatic lip-reading using neural network","author":"lee","year":"2016","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref86","first-page":"277","article-title":"Concatenated frame image based CNN for visual speech recognition","author":"saitoh","year":"2016","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref87","article-title":"Network in network","author":"lin","year":"2013","journal-title":"arXiv 1312 4400"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2019.04.010"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/9312710\/09522117.pdf?arnumber=9522117","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,12,17]],"date-time":"2021-12-17T19:57:40Z","timestamp":1639771060000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9522117\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":141,"URL":"https:\/\/doi.org\/10.1109\/access.2021.3107946","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}