{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T05:33:25Z","timestamp":1778304805906,"version":"3.51.4"},"reference-count":25,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,4]]},"DOI":"10.1109\/icassp.2018.8461326","type":"proceedings-article","created":{"date-parts":[[2018,9,21]],"date-time":"2018-09-21T22:24:48Z","timestamp":1537568688000},"page":"6548-6552","source":"Crossref","is-referenced-by-count":223,"title":["End-to-End Audiovisual Speech Recognition"],"prefix":"10.1109","author":[{"given":"Stavros","family":"Petridis","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Themos","family":"Stafylakis","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pingehuan","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feipeng","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Georgios","family":"Tzimiropoulos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Maja","family":"Pantic","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952625"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472852"},{"key":"ref12","author":"assael","year":"2016","journal-title":"Lipnet Sentence-level lipreading"},{"key":"ref13","first-page":"3652","article-title":"Combining residual networks with LSTMs for lipreading","volume":"9","author":"stafylakis","year":"2017","journal-title":"INTER-SPEECH"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472669"},{"key":"ref15","article-title":"Lip reading sentences in the wild","author":"chung","year":"2017","journal-title":"IEEE CVPR"},{"key":"ref16","doi-asserted-by":"crossref","DOI":"10.21437\/AVSP.2017-8","article-title":"End-to-end audiovisual fusion with LSTMs","author":"petridis","year":"2017","journal-title":"Proc Conf Auditory-Visual Speech Processing"},{"key":"ref17","first-page":"1","article-title":"Ouluvs2: A multi-view audiovisual database for nonrigid mouth motion analysis","author":"anina","year":"2015","journal-title":"IEEE FG"},{"key":"ref18","article-title":"End-to-end multimodal emotion recognition using deep neural networks","author":"tzirakis","year":"0","journal-title":"IEEE Journal of Selected Topics in Signal Processing 2017"},{"key":"ref19","first-page":"87","article-title":"Lip reading in the wild","author":"chung","year":"2016","journal-title":"ACCV"},{"key":"ref4","first-page":"689","article-title":"Multimodal deep learning","author":"ngiam","year":"2011","journal-title":"Proc of ICML"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2446462"},{"key":"ref6","article-title":"Integration of deep bottleneck features for audio-visual speech recognition","author":"ninomiya","year":"2015","journal-title":"InterSpeech"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.389"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-721"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178347"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/6046.865479"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472088"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2003.817150"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref21","first-page":"1189","article-title":"Moving-talker, speaker-independent feature study, and baseline results using the CUAVE multimodal speech corpus","volume":"2002","author":"patterson","year":"2002","journal-title":"EURASIP J Appl Signal Process"},{"key":"ref24","author":"kingma","year":"2014","journal-title":"Adam A method for stochastic optimization"},{"key":"ref23","first-page":"630","article-title":"Identity mappings in deep residual networks","author":"he","year":"2016","journal-title":"ECCV"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(93)90095-3"}],"event":{"name":"ICASSP 2018 - 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Calgary, AB","start":{"date-parts":[[2018,4,15]]},"end":{"date-parts":[[2018,4,20]]}},"container-title":["2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8450881\/8461260\/08461326.pdf?arnumber=8461326","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,8,24]],"date-time":"2020-08-24T01:07:50Z","timestamp":1598231270000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8461326\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,4]]},"references-count":25,"URL":"https:\/\/doi.org\/10.1109\/icassp.2018.8461326","relation":{},"subject":[],"published":{"date-parts":[[2018,4]]}}}