{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:13:35Z","timestamp":1780708415521,"version":"3.54.1"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019,12]]},"DOI":"10.1109\/asru46091.2019.9004036","type":"proceedings-article","created":{"date-parts":[[2020,2,21]],"date-time":"2020-02-21T02:01:33Z","timestamp":1582250493000},"page":"905-912","source":"Crossref","is-referenced-by-count":94,"title":["Recurrent Neural Network Transducer for Audio-Visual Speech Recognition"],"prefix":"10.1109","author":[{"given":"Takaki","family":"Makino","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hank","family":"Liao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yannis","family":"Assael","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Brendan","family":"Shillingford","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Basilio","family":"Garcia","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Otavio","family":"Braga","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Olivier","family":"Siohan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1987.1169544"},{"key":"ref32","article-title":"Ro-bust audio-visual speech recognition using bimodal DF-SMN with multi-condition training and dropout regularization","author":"zhang","year":"2019","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref31","article-title":"Efficient computation of confidence intervals forword error rates","author":"vilar","year":"2008","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref30","article-title":"A comparison of sequence-to-sequence models for speech recognition","author":"rohit","year":"2017","journal-title":"InterSpeech"},{"key":"ref37","year":"0","journal-title":"Regulation (EU) 2016\/679 of the European Parliament and of the Council of 27 April 2016 on the protection of natural persons with regard to the processing of personal data and on the free movement of such data and repealing Directive 95\/46\/EC (General Data Protection Regulation)"},{"key":"ref36","year":"0","journal-title":"Artificial Intelligence at Google Our Principles"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"ref34","volume":"37","author":"ephrat","year":"2018","journal-title":"Looking to listen at the cocktail party A speaker-independent audio-visual model for speech separation"},{"key":"ref10","article-title":"Large-scale visual speech recognition","author":"brendan","year":"2019","journal-title":"InterSpeech"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1943"},{"key":"ref12","article-title":"Sequence transduction with recurrent neural networks","author":"graves","year":"2012","journal-title":"International Conference on Machine Learning Representation Learning Workshop"},{"key":"ref13","article-title":"Comparison of parametric representations for monosyllable word recognition in continuously spoken sentences","volume":"28","author":"davis","year":"1980","journal-title":"IEEE Speech and Audio Processing"},{"key":"ref14","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"International Conference on Learning Representations"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1121\/1.1907309"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1511\/1998.25.861"},{"key":"ref17","author":"potamianos","year":"2003","journal-title":"Re-cent advances in the automatic recognition of audiovisual speech"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/s10844-016-0438-z"},{"key":"ref19","article-title":"LipNet: End-to-end sentence-level lipreading","author":"assael","year":"2017","journal-title":"GPU Technology Conference"},{"key":"ref28","author":"ba","year":"2016","journal-title":"Layer normalization"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639643"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"ref3","article-title":"Deep audiovisual speech recognition","author":"afouras","year":"2018","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"ref6","article-title":"Attention is all you need","author":"ashish","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref29","article-title":"ADAM: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"ICLRE"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref8","author":"afouras","year":"2018","journal-title":"LRS3- TED A large-scale dataset for visual speech recognition"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref2","volume":"abs 1807 5162","author":"brendan","year":"2018","journal-title":"Large-scale visual speech recognition"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707758"},{"key":"ref1","article-title":"Visual features for context-aware speech recognition","author":"gupta","year":"2017","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref20","article-title":"Lip reading in the wild","author":"chung","year":"2016","journal-title":"Asian Conference on Computer Vision"},{"key":"ref22","article-title":"Lip reading in profile","author":"chung","year":"2017","journal-title":"British Machine Vision Conference"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"ref23","year":"0","journal-title":"Google Cloud Vision API"},{"key":"ref26","article-title":"Con-text dependent phone models for LSTM RNN acoustic modelling","author":"senior","year":"2015","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref25","doi-asserted-by":"crossref","DOI":"10.1016\/0167-6393(93)90095-3","article-title":"Assess-ment for automatic speech recognition: II. NOISEX-92: A database and an experiment to study the effect of additive noise on speech recognition systems","volume":"12","author":"varga","year":"1993","journal-title":"Speech Communication"}],"event":{"name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"SG, Singapore","start":{"date-parts":[[2019,12,14]]},"end":{"date-parts":[[2019,12,18]]}},"container-title":["2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8985378\/9003727\/09004036.pdf?arnumber=9004036","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T10:51:19Z","timestamp":1658141479000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9004036\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,12]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/asru46091.2019.9004036","relation":{},"subject":[],"published":{"date-parts":[[2019,12]]}}}