{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T08:00:10Z","timestamp":1767859210294,"version":"3.49.0"},"reference-count":102,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,5,6]],"date-time":"2024-05-06T00:00:00Z","timestamp":1714953600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2024,5,6]],"date-time":"2024-05-06T00:00:00Z","timestamp":1714953600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/501100016386","name":"Conselleria de Innovaci\u00f3n, Universidades, Ciencia y Sociedad Digital, Generalitat Valenciana","doi-asserted-by":"publisher","award":["CIACIF\/2021\/295"],"award-info":[{"award-number":["CIACIF\/2021\/295"]}],"id":[{"id":"10.13039\/501100016386","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004837","name":"Ministerio de Ciencia e Innovaci\u00f3n","doi-asserted-by":"publisher","award":["PID2021-124719OB-I0"],"award-info":[{"award-number":["PID2021-124719OB-I0"]}],"id":[{"id":"10.13039\/501100004837","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J AUDIO SPEECH MUSIC PROC."],"abstract":"<jats:title>Abstract<\/jats:title><jats:p>Visual speech recognition (VSR) is a challenging task that has received increasing interest during the last few decades. Current state of the art employs powerful end-to-end architectures based on deep learning which depend on large amounts of data and high computational resources for their estimation. We address the task of VSR for data scarcity scenarios with limited computational resources by using traditional approaches based on hidden Markov models. We present a novel learning strategy that employs information obtained from previous acoustic temporal alignments to improve the visual system performance. Furthermore, we studied multiple visual speech representations and how image resolution or frame rate affect its performance. All these experiments were conducted on the limited data VLRF corpus, a database which offers an audio-visual support to address continuous speech recognition in Spanish. The results show that our approach significantly outperforms the best results achieved on the task to date.<\/jats:p>","DOI":"10.1186\/s13636-024-00345-7","type":"journal-article","created":{"date-parts":[[2024,5,6]],"date-time":"2024-05-06T06:02:28Z","timestamp":1714975348000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Continuous lipreading based on acoustic temporal alignments"],"prefix":"10.1186","volume":"2024","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7375-9515","authenticated-orcid":false,"given":"David","family":"Gimeno-G\u00f3mez","sequence":"first","affiliation":[]},{"given":"Carlos-D.","family":"Mart\u00ednez-Hinarejos","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,5,6]]},"reference":[{"issue":"3","key":"345_CR1","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1109\/6046.865479","volume":"2","author":"S Dupont","year":"2000","unstructured":"S. Dupont, J. Luettin, Audio-visual speech modeling for continuous speech recognition. IEEE Trans. Multimed. 
2(3), 141\u2013151 (2000). https:\/\/doi.org\/10.1109\/6046.865479","journal-title":"IEEE Trans. Multimed."},{"key":"345_CR2","doi-asserted-by":"crossref","unstructured":"J. Besle, A. Fort, C. Delpuech, M.-H. Giard, Bimodal speech: early suppressive visual effects in human auditory cortex. Eur. J. Neurosci. 20(8), 2225\u20132234 (2004). https:\/\/doi.org\/10.1111\/j.1460-9568.2004.03670.x","DOI":"10.1111\/j.1460-9568.2004.03670.x"},{"issue":"5588","key":"345_CR3","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1038\/264746a0","volume":"264","author":"H McGurk","year":"1976","unstructured":"H. McGurk, J. MacDonald, Hearing lips and seeing voices. Nature. 264(5588), 746\u2013748 (1976). https:\/\/doi.org\/10.1038\/264746a0","journal-title":"Nature."},{"key":"345_CR4","doi-asserted-by":"publisher","unstructured":"M. Gales, Maximum likelihood linear transformations for HMM-based speech recognition. Comput. Speech Lang. 12(2), 75\u201398 (1998). https:\/\/doi.org\/10.1006\/csla.1998.0043","DOI":"10.1006\/csla.1998.0043"},{"issue":"3","key":"345_CR5","doi-asserted-by":"publisher","first-page":"251","DOI":"10.2307\/1268779","volume":"33","author":"BH Juang","year":"1991","unstructured":"B.H. Juang, L.R. Rabiner, Hidden Markov models for speech recognition. Technometrics. 33(3), 251\u2013272 (1991). https:\/\/doi.org\/10.2307\/1268779","journal-title":"Technometrics."},{"key":"345_CR6","doi-asserted-by":"crossref","unstructured":"W. Chan, N. Jaitly, Q. Le, O. Vinyals, ICASSP. Listen, attend and spell: a neural network for large vocabulary conversational speech recognition (2016), pp. 4960\u20134964","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"345_CR7","doi-asserted-by":"crossref","unstructured":"P. Ma, S. Petridis, M. Pantic, ICASSP. End-to-end audio-visual speech recognition with conformers (IEEE, 2021), pp. 7613\u20137617","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"345_CR8","unstructured":"A. Radford, J.W. Kim, T. Xu, G. Brockman, C. McLeavey, I. Sutskever, Robust speech recognition via large-scale weak supervision (2022). arXiv preprint arXiv:2212.04356"},{"key":"345_CR9","doi-asserted-by":"crossref","unstructured":"M. Anwar, B. Shi, V. Goswami, W. Hsu, J. Pino, C. Wang, Interspeech. MuAViC: a multilingual audio-visual corpus for robust speech recognition and robust speech-to-text translation (ISCA, 2023), pp. 4064\u20134068","DOI":"10.21437\/Interspeech.2023-2279"},{"key":"345_CR10","doi-asserted-by":"crossref","unstructured":"M. Burchi, R. Timofte, WACV. Audio-visual efficient conformer for robust speech recognition (2023), pp. 2257\u20132266","DOI":"10.1109\/WACV56688.2023.00229"},{"key":"345_CR11","doi-asserted-by":"publisher","unstructured":"B. Juang, Speech recognition in adverse environments. Comput. Speech Lang. 5(3), 275\u2013294 (1991). https:\/\/doi.org\/10.1016\/0885-2308(91)90011-E","DOI":"10.1016\/0885-2308(91)90011-E"},{"key":"345_CR12","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889052","author":"T Afouras","year":"2018","unstructured":"T. Afouras, J.S. Chung, A. Senior, O. Vinyals, A. Zisserman, Deep audio-visual speech recognition. IEEE Trans. PAMI. (2018). https:\/\/doi.org\/10.1109\/TPAMI.2018.2889052","journal-title":"IEEE Trans. PAMI."},{"issue":"9","key":"345_CR13","doi-asserted-by":"publisher","first-page":"1306","DOI":"10.1109\/JPROC.2003.817150","volume":"91","author":"G Potamianos","year":"2003","unstructured":"G. Potamianos, C. Neti, G. Gravier, A. Garg, A. 
Senior, Recent advances in the automatic recognition of audiovisual speech. Proc. IEEE 91(9), 1306\u20131326 (2003). https:\/\/doi.org\/10.1109\/JPROC.2003.817150","journal-title":"Proc. IEEE"},{"key":"345_CR14","unstructured":"B. Shi, W.N. Hsu, K. Lakhotia, A. Mohamed, Learning audio-visual speech representation by masked multimodal cluster prediction (2022). arXiv preprint arXiv:2201.02184"},{"key":"345_CR15","volume-title":"ICANN","author":"P Eickhoff","year":"2023","unstructured":"P. Eickhoff, M. M\u00f6ller, T.P. Rosin, J. Twiefel, S. Wermter, ICANN. Bring the Noise: Introducing Noise Robustness to Pretrained Automatic Speech Recognition (Springer Nature Switzerland, 2023)"},{"key":"345_CR16","doi-asserted-by":"crossref","unstructured":"Z. Huang, S. Watanabe, S.-W. Yang, P. Garc\u00eda, S. Khudanpur, ICASSP. Investigating self-supervised learning for speech enhancement and separation (2022), pp. 6837\u20136841","DOI":"10.1109\/ICASSP43922.2022.9746303"},{"key":"345_CR17","doi-asserted-by":"crossref","unstructured":"S. Pascual, A. Bonafonte, J. Serr\u00e1, Interspeech. SEGAN: speech enhancement generative adversarial network (ISCA, 2017), pp. 3642\u20133646","DOI":"10.21437\/Interspeech.2017-1428"},{"key":"345_CR18","doi-asserted-by":"crossref","unstructured":"H. Yen, F. Germain, G. Wichern, J. Roux, ICASSP. Cold diffusion for speech enhancement (IEEE, 2023), pp. 1\u20135","DOI":"10.1109\/ICASSP49357.2023.10096064"},{"key":"345_CR19","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1016\/j.imavis.2018.07.002","volume":"78","author":"A Fernandez-Lopez","year":"2018","unstructured":"A. Fernandez-Lopez, F.M. Sukno, Survey on automatic lip-reading in the era of deep learning. Image Vision Comput. 78, 53\u201372 (2018). https:\/\/doi.org\/10.1016\/j.imavis.2018.07.002","journal-title":"Image Vision Comput."},{"key":"345_CR20","doi-asserted-by":"crossref","unstructured":"A. Fernandez-Lopez, F.M. Sukno, International Joint Conference on Computer Vision, Imaging and Computer Graphics. Optimizing phoneme-to-viseme mapping for continuous lip-reading in Spanish (Springer, 2017), pp. 305\u2013328","DOI":"10.1007\/978-3-030-12209-6_15"},{"key":"345_CR21","unstructured":"K. Thangthai, Computer lipreading via hybrid deep neural network hidden Markov models. Doctoral dissertation (University of East Anglia, 2018)"},{"key":"345_CR22","doi-asserted-by":"publisher","unstructured":"P. Ma, S. Petridis, M. Pantic. Visual speech recognition for multiple languages in the wild. Nat. Mach. Intell. 4(11), 930\u2013939 (2022). https:\/\/doi.org\/10.1038\/s42256-022-00550-z","DOI":"10.1038\/s42256-022-00550-z"},{"key":"345_CR23","doi-asserted-by":"publisher","first-page":"55354","DOI":"10.1109\/ACCESS.2020.2982359","volume":"8","author":"M Ezz","year":"2020","unstructured":"M. Ezz, A.M. Mostafa, A.A. Nasr, A silent password recognition framework based on lip analysis. IEEE Access 8, 55354\u201355371 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.2982359","journal-title":"IEEE Access"},{"key":"345_CR24","doi-asserted-by":"crossref","unstructured":"T. Stafylakis, G. Tzimiropoulos, ECCV. Zero-shot keyword spotting for visual speech recognition in-the-wild (2018), pp. 513\u2013529","DOI":"10.1109\/ICASSP.2018.8461347"},{"issue":"4","key":"345_CR25","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1016\/j.specom.2009.08.002","volume":"52","author":"B Denby","year":"2010","unstructured":"B. Denby, T. Schultz, K. Honda, T. Hueber, J.M. Gilbert, J.S. Brumberg, Silent speech interfaces. Speech Commun. 
52(4), 270\u2013287 (2010). https:\/\/doi.org\/10.1016\/j.specom.2009.08.002","journal-title":"Speech Commun."},{"key":"345_CR26","doi-asserted-by":"publisher","first-page":"177995","DOI":"10.1109\/ACCESS.2020.3026579","volume":"8","author":"J.A. Gonzalez-Lopez","year":"2020","unstructured":"J.A. Gonzalez-Lopez, A. Gomez-Alanis, J.M. Mart\u00edn Do\u00f1as, J.L. P\u00e9rez-C\u00f3rdoba, A.M. Gomez, Silent speech interfaces for speech restoration: a review. IEEE Access. 8, 177995\u2013178021 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.3026579","journal-title":"IEEE Access."},{"issue":"6","key":"345_CR27","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"G. Hinton, L. Deng, D. Yu, G.E. Dahl, A. Mohamed, N. Jaitly, B. Kingsbury, Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups. IEEE Signal Proc. Mag. 29(6), 82\u201397 (2012). https:\/\/doi.org\/10.1109\/MSP.2012.2205597","journal-title":"IEEE Signal Proc. Mag."},{"key":"345_CR28","doi-asserted-by":"crossref","unstructured":"K. Vesel\u00fd, A. Ghoshal, L. Burget, D. Povey, Interspeech. Sequence-discriminative training of deep neural networks (2013), pp. 2345\u20132349","DOI":"10.21437\/Interspeech.2013-548"},{"key":"345_CR29","doi-asserted-by":"crossref","unstructured":"R. Prabhavalkar, T. Hori, T. Sainath, R. Schl\u00fcter, S. Watanabe, End-to-end speech recognition: a survey (2023). arXiv preprint arXiv:2303.03329","DOI":"10.1109\/TASLP.2023.3328283"},{"key":"345_CR30","doi-asserted-by":"crossref","unstructured":"M. Gales, S. Young, The application of hidden Markov models in speech recognition (Foundations and Trends, Now Publishers Inc., 2008)","DOI":"10.1561\/9781601981219"},{"issue":"4","key":"345_CR31","doi-asserted-by":"publisher","first-page":"796","DOI":"10.1044\/jshr.1104.796","volume":"11","author":"C Fisher","year":"1968","unstructured":"C. Fisher, Confusions among visually perceived consonants. J. Speech Hear. Res. 11(4), 796\u2013804 (1968). https:\/\/doi.org\/10.1044\/jshr.1104.796","journal-title":"J. Speech Hear. Res."},{"key":"345_CR32","doi-asserted-by":"crossref","unstructured":"H. Bear, R. Harvey, B. Theobald, Y. Lan, International Symposium on Visual Computing. Which phoneme-to-viseme maps best improve visual-only computer lip-reading? (Springer, 2014), pp. 230\u2013239","DOI":"10.1007\/978-3-319-14364-4_22"},{"key":"345_CR33","unstructured":"L. Cappelletta, N. Harte, 19th European Signal Processing Conference. Viseme definitions comparison for visual-only speech recognition (2011), pp. 2109\u20132113"},{"key":"345_CR34","doi-asserted-by":"publisher","unstructured":"D. Howell, S. Cox, B. Theobald, Visual units and confusion modelling for automatic lip-reading. Image Vision Comput. 51, 1\u201312 (2016). https:\/\/doi.org\/10.1016\/j.imavis.2016.03.003","DOI":"10.1016\/j.imavis.2016.03.003"},{"key":"345_CR35","doi-asserted-by":"crossref","unstructured":"K. Thangthai, R. Harvey, Interspeech. Building large-vocabulary speaker-independent lipreading systems (ISCA, 2018), pp. 2648\u20132652","DOI":"10.21437\/Interspeech.2018-2112"},{"key":"345_CR36","unstructured":"K. Thangthai, R. Harvey, S. Cox, B. Theobald, AVSP. Improving lip-reading performance for robust audiovisual speech recognition using DNNs (2015), pp. 127\u2013131"},{"key":"345_CR37","doi-asserted-by":"crossref","unstructured":"H. Bear, R. Harvey, ICASSP. 
Decoding visemes: improving machine lip-reading (2016), pp. 2009\u20132013","DOI":"10.1109\/ICASSP.2016.7472029"},{"key":"345_CR38","doi-asserted-by":"crossref","unstructured":"H. Bear, R. Harvey, B. Theobald, Y. Lan, ICIP. Resolution limits on visual speech recognition (IEEE, 2014), pp. 1371\u20131375","DOI":"10.1109\/ICIP.2014.7025274"},{"issue":"2","key":"345_CR39","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1109\/34.982900","volume":"24","author":"I Matthews","year":"2002","unstructured":"I. Matthews, T.F. Cootes, J.A. Bangham, S. Cox, R. Harvey, Extraction of visual features for lipreading. IEEE Trans. PAMI 24(2), 198\u2013213 (2002). https:\/\/doi.org\/10.1109\/34.982900","journal-title":"IEEE Trans. PAMI"},{"key":"345_CR40","doi-asserted-by":"crossref","unstructured":"A.A. Shaikh, D.K. Kumar, W.C. Yau, C. Azemin, J. Gubbi, 3rd CISP. Lip reading using optical flow and support vector machines. IEEE. 1, 327\u2013330 (2010)","DOI":"10.1109\/CISP.2010.5646264"},{"key":"345_CR41","doi-asserted-by":"crossref","unstructured":"D. Parekh, A. Gupta, S. Chhatpar, A. Yash, M. Kulkarni, 5th I2CT. Lip reading using convolutional auto encoders as feature extractor (2019), pp. 1\u20136","DOI":"10.1109\/I2CT45611.2019.9033664"},{"key":"345_CR42","doi-asserted-by":"crossref","unstructured":"P. Ma, R. Mira, S. Petridis, B.W. Schuller, M. Pantic, Interspeech. LiRA: learning visual speech representations from audio through self-supervision (2021), pp. 3011\u20133015","DOI":"10.21437\/Interspeech.2021-1360"},{"issue":"4","key":"345_CR43","doi-asserted-by":"publisher","first-page":"487","DOI":"10.1109\/10.828148","volume":"47","author":"P Duchnowski","year":"2000","unstructured":"P. Duchnowski, D.S. Lum, J.C. Krause, M.G. Sexton, M.S. Bratakos, L.D. Braida, Development of speechreading supplements based on automatic speech recognition. IEEE Trans. Biomed. Eng. 47(4), 487\u2013496 (2000). https:\/\/doi.org\/10.1109\/10.828148","journal-title":"IEEE Trans. Biomed. Eng."},{"key":"345_CR44","unstructured":"Y. Lan, R. Harvey, B. Theobald, E. Ong, R. Bowden, International Conference on Auditory-Visual Speech Processing. Comparing visual features for lipreading (2009), pp. 102\u2013106"},{"key":"345_CR45","doi-asserted-by":"crossref","unstructured":"K. Thangthai, R. Harvey, Interspeech. Improving computer lipreading via DNN sequence discriminative training techniques (2017), pp. 3657\u20133661","DOI":"10.21437\/Interspeech.2017-106"},{"issue":"5","key":"345_CR46","doi-asserted-by":"publisher","first-page":"603","DOI":"10.1109\/TMM.2015.2407694","volume":"17","author":"N Harte","year":"2015","unstructured":"N. Harte, E. Gillen, TCD-TIMIT: an audio-visual corpus of continuous speech. IEEE Trans. Multimed. 17(5), 603\u2013615 (2015). https:\/\/doi.org\/10.1109\/TMM.2015.2407694","journal-title":"IEEE Trans. Multimed."},{"key":"345_CR47","unstructured":"K. Thangthai, H. Bear, R. Harvey, BMVC. Comparing phonemes and visemes with DNN-based lipreading (2017), pp. 4\u20137"},{"key":"345_CR48","doi-asserted-by":"crossref","unstructured":"P. Ma, A. Haliassos, A. Fernandez-Lopez, H. Chen, S. Petridis, M. Pantic, ICASSP. Auto-AVSR: audio-visual speech recognition with automatic labels (2023), pp. 1\u20135","DOI":"10.1109\/ICASSP49357.2023.10096889"},{"key":"345_CR49","doi-asserted-by":"crossref","unstructured":"H. Chen, H. Zhou, J. Du, C.-H. Lee, J. Chen, S. Watanabe, C. Liu, ICASSP. 
The first multimodal information based speech processing (MISP) challenge: data, tasks, baselines and results (IEEE, 2022), pp. 9266\u20139270","DOI":"10.1109\/ICASSP43922.2022.9746683"},{"key":"345_CR50","doi-asserted-by":"crossref","unstructured":"K.R. Prajwal, T. Afouras, A. Zisserman, CVPR. Sub-word level lip reading with visual attention (IEEE, 2022), pp. 5162\u20135172","DOI":"10.1109\/CVPR52688.2022.00510"},{"key":"345_CR51","doi-asserted-by":"crossref","unstructured":"J. Son Chung, A. Senior, O. Vinyals, A. Zisserman, CVPR. Lip reading sentences in the wild (2017), pp. 6447\u20136456","DOI":"10.1109\/CVPR.2017.367"},{"key":"345_CR52","unstructured":"T. Afouras, J.-S. Chung, A. Zisserman, LRS3-TED: a large-scale dataset for visual speech recognition. (2018). arXiv preprint arXiv:1809.00496"},{"key":"345_CR53","doi-asserted-by":"crossref","unstructured":"S. Bhati, J. Villalba, L. Moro-Velazquez, T. Thebaud, N. Dehak, Leveraging pretrained image-text models for improving audio-visual learning (2023). arXiv preprint arXiv:2309.04628","DOI":"10.21437\/Interspeech.2023-135"},{"key":"345_CR54","unstructured":"J. Ngiam, A. Khosla, M. Kim, J. Nam, H. Lee, A. Ng, 28th ICML. Multimodal deep learning (PMLR, 2011), pp. 689\u2013696"},{"key":"345_CR55","unstructured":"A. Radford, J.W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, I. Sutskever, ICML. Learning transferable visual models from natural language supervision, vol. 139 (PMLR, 2021), pp. 8748\u20138763"},{"key":"345_CR56","unstructured":"E. Petajan, CVPR. Automatic lipreading to enhance speech recognition (IEEE, 1985), pp. 40\u201347"},{"key":"345_CR57","doi-asserted-by":"crossref","unstructured":"A. Adjoudani, C. Beno\u00eet, Speechreading by humans and machines. On the integration of auditory and visual parameters in an HMM-based ASR. (Springer, 1996), pp. 461\u2013471","DOI":"10.1007\/978-3-662-13015-5_35"},{"issue":"6","key":"345_CR58","doi-asserted-by":"publisher","first-page":"629","DOI":"10.1109\/89.799688","volume":"7","author":"P Teissier","year":"1999","unstructured":"P. Teissier, J. Robert-Ribes, J. Schwartz, A. Gu\u00e9rin-Dugu\u00e9, Comparing models for audiovisual fusion in a noisy-vowel recognition task. IEEE Trans. Speech Audio Process. 7(6), 629\u2013642 (1999). https:\/\/doi.org\/10.1109\/89.799688","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"345_CR59","doi-asserted-by":"crossref","unstructured":"T. Afouras, J.S. Chung, A. Zisserman, ICASSP. ASR is all you need: cross-modal distillation for lip reading (2020), pp. 2143\u20132147","DOI":"10.1109\/ICASSP40776.2020.9054253"},{"key":"345_CR60","unstructured":"Y.A.D. Djilali, S. Narayan, H. Boussaid, E. Almazrouei, M. Debbah, ICCV. Lip2Vec: efficient and robust visual speech recognition via Latent-to-Latent Visual to Audio Representation Mapping (IEEE, 2023), pp. 13790\u201313801"},{"key":"345_CR61","doi-asserted-by":"crossref","unstructured":"C. Sui, M. Bennamoun, R. Togneri, ICCV. Listening with your eyes: towards a practical visual speech recognition system using deep Boltzmann machines (2015), pp. 154\u2013162","DOI":"10.1109\/ICCV.2015.26"},{"key":"345_CR62","unstructured":"A. Thanda, S. Venkatesan, Multi-task learning of deep neural networks for audio visual automatic speech recognition (2017). arXiv preprint arXiv:1701.02477"},{"issue":"1","key":"345_CR63","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1023\/A:1007379606734","volume":"28","author":"R Caruana","year":"1997","unstructured":"R. Caruana, Multitask learning. Mach. Learn. 
28(1), 41\u201375 (1997). https:\/\/doi.org\/10.1023\/A:1007379606734","journal-title":"Mach. Learn."},{"key":"345_CR64","doi-asserted-by":"crossref","unstructured":"A. Fernandez-Lopez, O. Martinez, F.M. Sukno, 12th FG. Towards estimating the upper bound of visual-speech recognition: the visual lip-reading feasibility database (2017), pp. 208\u2013215","DOI":"10.1109\/FG.2017.34"},{"key":"345_CR65","doi-asserted-by":"publisher","first-page":"2076","DOI":"10.1109\/TASLP.2022.3182274","volume":"30","author":"A Fernandez-Lopez","year":"2022","unstructured":"A. Fernandez-Lopez, F. Sukno, End-to-end lip-reading without large-scale data. IEEE\/ACM TASLP. 30, 2076\u20132090 (2022). https:\/\/doi.org\/10.1109\/TASLP.2022.3182274","journal-title":"IEEE\/ACM TASLP."},{"key":"345_CR66","doi-asserted-by":"crossref","unstructured":"D. Gimeno-Gomez, C.-D. Martinez-Hinarejos, IberSPEECH. Speaker-adapted end-to-end visual speech recognition for continuous Spanish (2022), pp. 41\u201345","DOI":"10.21437\/IberSPEECH.2022-9"},{"key":"345_CR67","doi-asserted-by":"crossref","unstructured":"D. Gimeno-G\u00f3mez, C.-D. Mart\u00ednez-Hinarejos, IberSPEECH. Analysis of visual features for continuous lipreading in Spanish (2021), pp. 220\u2013224","DOI":"10.21437\/IberSPEECH.2021-47"},{"key":"345_CR68","unstructured":"D. Gimeno-G\u00f3mez, C.-D. Mart\u00ednez-Hinarejos, LREC. LIP-RTVE: an audiovisual database for continuous Spanish in the wild (ELRA, 2022), pp. 2750\u20132758"},{"issue":"1","key":"345_CR69","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1109\/T-C.1974.223784","volume":"100","author":"N Ahmed","year":"1974","unstructured":"N. Ahmed, T. Natarajan, K. Rao, Discrete cosine transform. IEEE Trans. Comput. 100(1), 90\u201393 (1974). https:\/\/doi.org\/10.1109\/T-C.1974.223784","journal-title":"IEEE Trans. Comput."},{"key":"345_CR70","doi-asserted-by":"publisher","unstructured":"D. Lowe, Distinctive image features from scale-invariant keypoints. Int. J. Comput. Vis. 60(2), 91\u2013110 (2004). https:\/\/doi.org\/10.1023\/B:VISI.0000029664.99615.94","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"345_CR71","doi-asserted-by":"crossref","unstructured":"P. Wiggers, J.C. Wojdel, L. Rothkrantz, 7th ICSLP. Medium vocabulary continuous audio-visual speech recognition (ISCA, 2002), pp. 1921\u20131924","DOI":"10.21437\/ICSLP.2002-433"},{"key":"345_CR72","unstructured":"G. Bradski, The OpenCV library. Dr Dobb\u2019s J. Softw. Tools. 25, 120\u2013125 (2000)"},{"key":"345_CR73","first-page":"1755","volume":"10","author":"D King","year":"2009","unstructured":"D. King, Dlib-ml: a machine learning toolkit. J. Mach. Learn. Res. 10, 1755\u20131758 (2009)","journal-title":"J. Mach. Learn. Res."},{"key":"345_CR74","doi-asserted-by":"crossref","unstructured":"V. Kazemi, J. Sullivan, CVPR. One millisecond face alignment with an ensemble of regression trees (2014), pp. 1867\u20131874","DOI":"10.1109\/CVPR.2014.241"},{"key":"345_CR75","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1016\/j.cviu.2015.09.013","volume":"141","author":"O Koller","year":"2015","unstructured":"O. Koller, J. Forster, H. Ney, Continuous sign language recognition: towards large vocabulary statistical recognition systems handling multiple signers. Comput. Vision Image Underst. 141, 108\u2013125 (2015). https:\/\/doi.org\/10.1016\/j.cviu.2015.09.013","journal-title":"Comput. 
Vision Image Underst."},{"key":"345_CR76","doi-asserted-by":"publisher","first-page":"2","DOI":"10.5772\/36466","volume":"3","author":"A Chitu","year":"2009","unstructured":"A. Chitu, L. Rothkrantz, Visual speech recognition automatic system for lip reading of Dutch. J. Inf. Technol. Control. 3, 2\u20139 (2009). https:\/\/doi.org\/10.5772\/36466","journal-title":"J. Inf. Technol. Control."},{"key":"345_CR77","doi-asserted-by":"crossref","unstructured":"K. Delac, M. Grgic, P. Liatsis, 47th ELMAR. Appearance-based statistical methods for face recognition (IEEE, 2005), pp. 151\u2013158","DOI":"10.1109\/ELMAR.2005.193665"},{"issue":"1\u20133","key":"345_CR78","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1016\/0169-7439(87)80084-9","volume":"2","author":"S Wold","year":"1987","unstructured":"S. Wold, K. Esbensen, P. Geladi, Principal component analysis. Chemometr. Intell. Lab. Syst. 2(1\u20133), 37\u201352 (1987). https:\/\/doi.org\/10.1016\/0169-7439(87)80084-9","journal-title":"Chemometr. Intell. Lab. Syst."},{"key":"345_CR79","doi-asserted-by":"crossref","unstructured":"I. Fung, B. Mak, IEEE ICASSP. End-to-end low-resource lip-reading with maxout CNN and LSTM. (IEEE, 2018), pp. 2511\u20132515","DOI":"10.1109\/ICASSP.2018.8462280"},{"key":"345_CR80","doi-asserted-by":"crossref","unstructured":"K. Pale\u010dek, International Conference on Speech and Computer. Extraction of features for lip-reading using autoencoders (2014), pp. 209\u2013216","DOI":"10.1007\/978-3-319-11581-8_26"},{"issue":"1","key":"345_CR81","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/2200000006","volume":"2","author":"Y Bengio","year":"2009","unstructured":"Y. Bengio, Learning deep architectures for AI. Found. Trends Mach. Learn. 2(1), 1\u2013127 (2009). https:\/\/doi.org\/10.1561\/2200000006","journal-title":"Found. Trends Mach. Learn."},{"key":"345_CR82","unstructured":"G. Potamianos, J. Luettin, C. Neti, ICASSP. Hierarchical discriminant features for audio-visual LVCSR, vol. 1 (2001), pp. 165\u2013168"},{"key":"345_CR83","unstructured":"D. Povey, A. Ghoshal, G. Boulianne, L. Burget, O. Glembek, N. Goel, K. Vesely, ASRU. The Kaldi Speech Recognition Toolkit (IEEE Signal Processing Society, 2011)"},{"key":"345_CR84","volume-title":"Linear statistical inference and its applications","author":"C Rao","year":"1965","unstructured":"C. Rao, Linear statistical inference and its applications (John Wiley & Sons, New York, 1965)"},{"key":"345_CR85","first-page":"661","volume":"2","author":"R Gopinath","year":"1998","unstructured":"R. Gopinath, ICASSP. Maximum likelihood modeling with Gaussian distributions for classification. 2, 661\u2013664 (1998)","journal-title":"Maximum likelihood modeling with Gaussian distributions for classification"},{"key":"345_CR86","doi-asserted-by":"crossref","unstructured":"T. Anastasakos, J. McDonough, J. Makhoul, ICASSP. Speaker adaptive training: a maximum likelihood approach to speaker normalization. IEEE. 2, 1043\u20131046 (1997)","DOI":"10.1109\/ICASSP.1997.596119"},{"key":"345_CR87","doi-asserted-by":"crossref","unstructured":"G.E. Hinton, in Neural Networks: Tricks of the Trade: Second Edition. A practical guide to training restricted Boltzmann machines (Berlin, Heidelberg, Springer Berlin Heidelberg, 2012), pp. 599\u2013619","DOI":"10.1007\/978-3-642-35289-8_32"},{"key":"345_CR88","doi-asserted-by":"crossref","unstructured":"B. Kingsbury, ICASSP. 
Lattice-based optimization of sequence classification criteria for neural-network acoustic modeling (IEEE, 2009), pp. 3761\u20133764","DOI":"10.1109\/ICASSP.2009.4960445"},{"key":"345_CR89","doi-asserted-by":"crossref","unstructured":"G. Wang, K.C. Sim, Interspeech. Sequential classification criteria for NNs in automatic speech recognition (ISCA, 2011), pp. 441\u2013444","DOI":"10.21437\/Interspeech.2011-170"},{"key":"345_CR90","first-page":"49","volume":"11","author":"L Bahl","year":"1986","unstructured":"L. Bahl, P. Brown, P. de Souza, R. Mercer, ICASSP. Maximum mutual information estimation of hidden Markov model parameters for speech recognition. 11, 49\u201352 (1986)","journal-title":"Maximum mutual information estimation of hidden Markov model parameters for speech recognition"},{"key":"345_CR91","doi-asserted-by":"crossref","unstructured":"D. Povey, P. Woodland, ICASSP. Minimum phone error and I-smoothing for improved discriminative training, vol. 1 (2002), pp. I-105\u2013I-108","DOI":"10.1109\/ICASSP.2002.1005687"},{"key":"345_CR92","first-page":"887","volume":"2","author":"J Kaiser","year":"2000","unstructured":"J. Kaiser, B. Horvat, Z. Kacic, ICSLP. A novel loss function for the overall risk criterion based discriminative training of HMM models. 2, 887\u2013890 (2000)","journal-title":"A novel loss function for the overall risk criterion based discriminative training of HMM models"},{"key":"345_CR93","doi-asserted-by":"crossref","unstructured":"D. Povey, B. Kingsbury, ICASSP. Evaluation of proposed modifications to MPE for large scale discriminative training, vol 4 (IEEE, 2007), pp. IV-321\u2013IV-324","DOI":"10.1109\/ICASSP.2007.366914"},{"key":"345_CR94","doi-asserted-by":"crossref","unstructured":"M. Mohri, F. Pereira, M. Riley, Springer Handbook of Speech Processing. Speech recognition with weighted finite-state transducers (Springer, 2008), pp. 559\u2013584","DOI":"10.1007\/978-3-540-49127-9_28"},{"key":"345_CR95","unstructured":"A. Quilis, Principios de fonolog\u00eda y fon\u00e9tica espa\u00f1olas, vol. 43 (Arco Libros, 1997)"},{"key":"345_CR96","doi-asserted-by":"crossref","unstructured":"A. Stolcke, ICSLP. SRILM \u2013 an extensible language modeling toolkit (ISCA, 2002), pp. 901\u2013904","DOI":"10.21437\/ICSLP.2002-303"},{"key":"345_CR97","doi-asserted-by":"crossref","unstructured":"M. Bisani, H. Ney, ICASSP. Bootstrap estimates for confidence intervals in ASR performance evaluation. IEEE. 1, 409\u2013412 (2004)","DOI":"10.1109\/ICASSP.2004.1326009"},{"key":"345_CR98","unstructured":"A. Zadeh, Y. Cao, S. Hessner, P. Liang, S. Poria, L. Morency, EMNLP. MOSEAS: a multimodal language dataset for Spanish, Portuguese, German and French (ACL, 2020), pp. 1801\u20131812"},{"key":"345_CR99","doi-asserted-by":"crossref","unstructured":"H. Hadian, H. Sameti, D. Povey, S. Khudanpur, Interspeech. End-to-end speech recognition using lattice-free MMI (ISCA, 2018), pp. 12\u201316","DOI":"10.21437\/Interspeech.2018-1423"},{"issue":"11","key":"345_CR100","doi-asserted-by":"publisher","first-page":"1949","DOI":"10.1109\/TASLP.2018.2848701","volume":"26","author":"H Hadian","year":"2018","unstructured":"H. Hadian, H. Sameti, D. Povey, S. Khudanpur, Flat-start single-stage discriminatively trained HMM-based models for ASR. IEEE\/ACM TASLP. 26(11), 1949\u20131961 (2018). https:\/\/doi.org\/10.1109\/TASLP.2018.2848701","journal-title":"IEEE\/ACM TASLP."},{"key":"345_CR101","doi-asserted-by":"crossref","unstructured":"O. Hrinchuk, M. Popova, B. Ginsburg, ICASSP. 
Correction of automatic speech recognition with transformer sequence-to-sequence model (IEEE, 2020), pp. 7074\u20137078","DOI":"10.1109\/ICASSP40776.2020.9053051"},{"key":"345_CR102","doi-asserted-by":"crossref","unstructured":"L. Mai, J. Carson-Berndsen, Enhancing conversational quality in language learning chatbots: an evaluation of GPT4 for ASR error correction (2023). arXiv preprint arXiv:2307.09744","DOI":"10.1109\/ICASSP48485.2024.10447641"}],"container-title":["EURASIP Journal on Audio, Speech, and Music Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-024-00345-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s13636-024-00345-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-024-00345-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,18]],"date-time":"2024-11-18T02:18:55Z","timestamp":1731896335000},"score":1,"resource":{"primary":{"URL":"https:\/\/asmp-eurasipjournals.springeropen.com\/articles\/10.1186\/s13636-024-00345-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,6]]},"references-count":102,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2024,12]]}},"alternative-id":["345"],"URL":"https:\/\/doi.org\/10.1186\/s13636-024-00345-7","relation":{},"ISSN":["1687-4722"],"issn-type":[{"value":"1687-4722","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,5,6]]},"assertion":[{"value":"16 July 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 April 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 May 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"The authors declare that they have no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"25"}}