{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:31:15Z","timestamp":1776882675476,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,10,2]],"date-time":"2018-10-02T00:00:00Z","timestamp":1538438400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"ADAPT Centre for Digital Content Technology","award":["Grant 13\/RC\/2106"],"award-info":[{"award-number":["Grant 13\/RC\/2106"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,10,2]]},"DOI":"10.1145\/3242969.3243014","type":"proceedings-article","created":{"date-parts":[[2018,10,2]],"date-time":"2018-10-02T12:09:29Z","timestamp":1538482169000},"page":"111-115","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":50,"title":["Attention-based Audio-Visual Fusion for Robust Automatic Speech Recognition"],"prefix":"10.1145","author":[{"given":"George","family":"Sterpu","sequence":"first","affiliation":[{"name":"Trinity College Dublin, Dublin, Ireland"}]},{"given":"Christian","family":"Saam","sequence":"additional","affiliation":[{"name":"Trinity College Dublin, Dublin, Ireland"}]},{"given":"Naomi","family":"Harte","sequence":"additional","affiliation":[{"name":"Trinity College Dublin, Dublin, Ireland"}]}],"member":"320","published-online":{"date-parts":[[2018,10,2]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Dzmitry Bahdanau Kyunghyun Cho and Yoshua Bengio . 2018. Neural Machine Translation by Jointly Learning to Align and Translate International Conference on Learning Representations. deftempurl%http:\/\/arxiv.org\/abs\/1409.0473 tempurl  Dzmitry Bahdanau Kyunghyun Cho and Yoshua Bengio . 2018. Neural Machine Translation by Jointly Learning to Align and Translate International Conference on Learning Representations. deftempurl%http:\/\/arxiv.org\/abs\/1409.0473 tempurl"},{"key":"e_1_3_2_2_2_1","unstructured":"T. Baltruusaitis C. Ahuja and L. P. Morency . 2018. Multimodal Machine Learning: A Survey and Taxonomy. IEEE Transactions on Pattern Analysis and Machine Intelligence (2018) 1--1.  T. Baltruusaitis C. Ahuja and L. P. Morency . 2018. Multimodal Machine Learning: A Survey and Taxonomy. IEEE Transactions on Pattern Analysis and Machine Intelligence (2018) 1--1."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"T. Baltruusaitis P. Robinson and L. P. Morency . 2016. OpenFace: An open source facial behavior analysis toolkit 2016 IEEE Winter Conference on Applications of Computer Vision (WACV). 1--10.  T. Baltruusaitis P. Robinson and L. P. Morency . 2016. OpenFace: An open source facial behavior analysis toolkit 2016 IEEE Winter Conference on Applications of Computer Vision (WACV). 1--10.","DOI":"10.1109\/WACV.2016.7477553"},{"key":"e_1_3_2_2_4_1","unstructured":"BBC and Oxford University . 2017. The BBC-Oxford Multi-View Lip Reading Sentences 2 (LRS2) Dataset. http:\/\/www.robots.ox.ac.uk\/ vgg\/data\/lip_reading_sentences\/. (2017). Online Accessed: 11 August 2018.  BBC and Oxford University . 2017. The BBC-Oxford Multi-View Lip Reading Sentences 2 (LRS2) Dataset. http:\/\/www.robots.ox.ac.uk\/ vgg\/data\/lip_reading_sentences\/. (2017). Online Accessed: 11 August 2018."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Chung-Cheng Chiu Tara Sainath Yonghui Wu Rohit Prabhavalkar Patrick Nguyen Zhifeng Chen Anjuli Kannan Ron J. Weiss Kanishka Rao Katya Gonina Navdeep Jaitly Bo Li Jan Chorowski and Michiel Bacchiani . 2018. State-of-the-art Speech Recognition With Sequence-to-Sequence Models ICASSP. deftempurl%https:\/\/arxiv.org\/pdf\/1712.01769.pdf tempurl  Chung-Cheng Chiu Tara Sainath Yonghui Wu Rohit Prabhavalkar Patrick Nguyen Zhifeng Chen Anjuli Kannan Ron J. Weiss Kanishka Rao Katya Gonina Navdeep Jaitly Bo Li Jan Chorowski and Michiel Bacchiani . 2018. State-of-the-art Speech Recognition With Sequence-to-Sequence Models ICASSP. deftempurl%https:\/\/arxiv.org\/pdf\/1712.01769.pdf tempurl","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"e_1_3_2_2_7_1","volume-title":"Lip Reading in the Wild. In Asian Conference on Computer Vision.","author":"Chung J. S.","year":"2016"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1049\/cp:19991218"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2407694"},{"key":"e_1_3_2_2_10_1","volume-title":"Computer Vision -- ECCV","author":"He Kaiming","year":"2016"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2015.2459017"},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of the 28th International Conference on Machine Learning, ICML 2011. 689--696","author":"Ngiam Jiquan"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"Stavros Petridis Themos Stafylakis Pingchuan Ma Feipeng Cai Georgios mboxTzimiropoulos and Maja Pantic . 2018. End-to-end Audiovisual Speech Recognition. In ICASSP. deftempurl%http:\/\/arxiv.org\/abs\/1802.06424 tempurl  Stavros Petridis Themos Stafylakis Pingchuan Ma Feipeng Cai Georgios mboxTzimiropoulos and Maja Pantic . 2018. End-to-end Audiovisual Speech Recognition. In ICASSP. deftempurl%http:\/\/arxiv.org\/abs\/1802.06424 tempurl","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"e_1_3_2_2_14_1","first-page":"9","article-title":"Recent advances in the automatic recognition of audiovisual speech","volume":"91","author":"Potamianos G.","year":"2003","journal-title":"Proc. IEEE"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"crossref","unstructured":"Rohit Prabhavalkar Kanishka Rao Tara N. Sainath Bo Li Leif Johnson and Navdeep Jaitly . 2017. A Comparison of Sequence-to-Sequence Models for Speech Recognition Proc. Interspeech 2017. 939--943.  Rohit Prabhavalkar Kanishka Rao Tara N. Sainath Bo Li Leif Johnson and Navdeep Jaitly . 2017. A Comparison of Sequence-to-Sequence Models for Speech Recognition Proc. Interspeech 2017. 939--943.","DOI":"10.21437\/Interspeech.2017-233"},{"key":"e_1_3_2_2_16_1","volume-title":"Computer Vision textendash ECCV 2016 (Lecture Notes in Computer Science)","author":"Rajagopalan Shyam Sundar"},{"key":"e_1_3_2_2_17_1","volume-title":"On the Convergence of Adam and Beyond. In International Conference on Learning Representations. deftempurl%https:\/\/openreview.net\/forum?id=ryQu7f-RZ tempurl","author":"Reddi Sashank J.","year":"2018"},{"key":"e_1_3_2_2_18_1","volume-title":"Thirtieth AAAI Conference on Artificial Intelligence. deftempurl%https:\/\/www.aaai.org\/ocs\/index.php\/AAAI\/AAAI16\/paper\/view\/12386 tempurl 00014","author":"Ren Jimmy","year":"2016"},{"key":"e_1_3_2_2_19_1","volume-title":"Lip Reading Sentences in the Wild. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Chung Joon Son","year":"2017"},{"key":"e_1_3_2_2_20_1","volume-title":"Combining Residual Networks with LSTMs for Lipreading Proc. Interspeech","author":"Stafylakis Themos","year":"2017"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2815268"},{"key":"e_1_3_2_2_22_1","volume-title":"Improving Speaker-Independent Lipreading with Domain-Adversarial Training Proc. Interspeech","author":"Wand Michael","year":"2017"},{"key":"e_1_3_2_2_23_1","volume-title":"Memory Fusion Network for Multi-view Sequential Learning AAAI Conference on Artificial Intelligence. deftempurl%https:\/\/aaai.org\/ocs\/index.php\/AAAI\/AAAI18\/paper\/view\/17341 tempurl","author":"Zadeh Amir","year":"2018"},{"key":"e_1_3_2_2_24_1","volume-title":"Multi-attention Recurrent Network for Human Communication Comprehension AAAI Conference on Artificial Intelligence. deftempurl%https:\/\/aaai.org\/ocs\/index.php\/AAAI\/AAAI18\/paper\/view\/17390 tempurl","author":"Zadeh Amir","year":"2018"}],"event":{"name":"ICMI '18: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Boulder CO USA","acronym":"ICMI '18","sponsor":["SIGCHI Specialist Interest Group in Computer-Human Interaction of the ACM"]},"container-title":["Proceedings of the 20th ACM International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3242969.3243014","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3242969.3243014","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T01:39:25Z","timestamp":1750210765000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3242969.3243014"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,2]]},"references-count":24,"alternative-id":["10.1145\/3242969.3243014","10.1145\/3242969"],"URL":"https:\/\/doi.org\/10.1145\/3242969.3243014","relation":{},"subject":[],"published":{"date-parts":[[2018,10,2]]},"assertion":[{"value":"2018-10-02","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}