{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:27:10Z","timestamp":1750220830043,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":30,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,14]],"date-time":"2019-10-14T00:00:00Z","timestamp":1571011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["3102019ZY1004"],"award-info":[{"award-number":["3102019ZY1004"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012659","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61571363,61571362"],"award-info":[{"award-number":["61571363,61571362"]}],"id":[{"id":"10.13039\/501100012659","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010228","name":"Natural Science Foundation of Shaanxi Province","doi-asserted-by":"publisher","award":["2018JM6015"],"award-info":[{"award-number":["2018JM6015"]}],"id":[{"id":"10.13039\/501100010228","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,14]]},"DOI":"10.1145\/3340555.3356102","type":"proceedings-article","created":{"date-parts":[[2019,10,17]],"date-time":"2019-10-17T12:49:48Z","timestamp":1571316588000},"page":"540-545","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Deep Audio-visual System for Closed-set Word-level Speech Recognition"],"prefix":"10.1145","author":[{"given":"Yougen","family":"Yuan","sequence":"first","affiliation":[{"name":"Northwestern Polytechnical University, China"}]},{"given":"Wei","family":"Tang","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, China"}]},{"given":"Minhao","family":"Fan","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, China"}]},{"given":"Yue","family":"Cao","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, China"}]},{"given":"Peng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, China"}]},{"given":"Lei","family":"Xie","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, China"}]}],"member":"320","published-online":{"date-parts":[[2019,10,14]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Deep audio-visual speech recognition","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras , Joon\u00a0Son Chung , Andrew Senior , Oriol Vinyals , and Andrew Zisserman . 2018. Deep audio-visual speech recognition . IEEE transactions on pattern analysis and machine intelligence ( 2018 ). Triantafyllos Afouras, Joon\u00a0Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018. Deep audio-visual speech recognition. IEEE transactions on pattern analysis and machine intelligence (2018)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472172"},{"key":"e_1_3_2_1_3_1","volume-title":"Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599(2016).","author":"Assael M","year":"2016","unstructured":"Yannis\u00a0 M Assael , Brendan Shillingford , Shimon Whiteson , and Nando De\u00a0Freitas . 2016 . Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599(2016). Yannis\u00a0M Assael, Brendan Shillingford, Shimon Whiteson, and Nando De\u00a0Freitas. 2016. Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599(2016)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/6046.865479"},{"key":"e_1_3_2_1_5_1","volume-title":"Proc. ICML. 1764\u20131772","author":"Graves Alex","year":"2014","unstructured":"Alex Graves and Navdeep Jaitly . 2014 . Towards end-to-end speech recognition with recurrent neural networks . In Proc. ICML. 1764\u20131772 . Alex Graves and Navdeep Jaitly. 2014. Towards end-to-end speech recognition with recurrent neural networks. In Proc. ICML. 1764\u20131772."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_8_1","volume-title":"Deep neural networks for acoustic modeling in speech recognition","author":"Hinton Geoffrey","year":"2012","unstructured":"Geoffrey Hinton , Li Deng , Dong Yu , George Dahl , Abdel-rahman Mohamed, Navdeep Jaitly , Andrew Senior , Vincent Vanhoucke , Patrick Nguyen , Brian Kingsbury , 2012. Deep neural networks for acoustic modeling in speech recognition . IEEE Signal processing magazine 29 ( 2012 ). Geoffrey Hinton, Li Deng, Dong Yu, George Dahl, Abdel-rahman Mohamed, Navdeep Jaitly, Andrew Senior, Vincent Vanhoucke, Patrick Nguyen, Brian Kingsbury, 2012. Deep neural networks for acoustic modeling in speech recognition. IEEE Signal processing magazine 29 (2012)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639140"},{"key":"e_1_3_2_1_10_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014).","author":"Kingma P","year":"2014","unstructured":"Diederik\u00a0 P Kingma and Jimmy Ba . 2014 . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014). Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980(2014)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIS.2016.7550888"},{"key":"e_1_3_2_1_12_1","volume-title":"Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio.","author":"Lin Zhouhan","year":"2017","unstructured":"Zhouhan Lin , Minwei Feng , Cicero Nogueira\u00a0dos Santos , Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017 . A structured self-attentive sentence embedding. arXiv preprint arXiv:1703.03130(2017). Zhouhan Lin, Minwei Feng, Cicero Nogueira\u00a0dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017. A structured self-attentive sentence embedding. arXiv preprint arXiv:1703.03130(2017)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2010-343"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-204"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-293"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-014-0629-7"},{"key":"e_1_3_2_1_17_1","volume-title":"Proc. NIPS-W.","author":"Paszke Adam","year":"2017","unstructured":"Adam Paszke , Sam Gross , Soumith Chintala , Gregory Chanan , Edward Yang , Zachary DeVito , Zeming Lin , Alban Desmaison , Luca Antiga , and Adam Lerer . 2017 . Automatic differentiation in PyTorch . In Proc. NIPS-W. Adam Paszke, Sam Gross, Soumith Chintala, Gregory Chanan, Edward Yang, Zachary DeVito, Zeming Lin, Alban Desmaison, Luca Antiga, and Adam Lerer. 2017. Automatic differentiation in PyTorch. In Proc. NIPS-W."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952625"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","first-page":"45","DOI":"10.1109\/TAFFC.2015.2446462","article-title":"Prediction-based audiovisual fusion for classification of non-linguistic vocalisations","volume":"7","author":"Petridis Stavros","year":"2015","unstructured":"Stavros Petridis and Maja Pantic . 2015 . Prediction-based audiovisual fusion for classification of non-linguistic vocalisations . IEEE Transactions on Affective Computing 7 , 1 (2015), 45 \u2013 58 . Stavros Petridis and Maja Pantic. 2015. Prediction-based audiovisual fusion for classification of non-linguistic vocalisations. IEEE Transactions on Affective Computing 7, 1 (2015), 45\u201358.","journal-title":"IEEE Transactions on Affective Computing"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639643"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2003.817150"},{"key":"e_1_3_2_1_23_1","volume-title":"Audio-visual automatic speech recognition: An overview. Issues in visual and audio-visual speech processing 22","author":"Potamianos Gerasimos","year":"2004","unstructured":"Gerasimos Potamianos , Chalapathy Neti , Juergen Luettin , and Iain Matthews . 2004. Audio-visual automatic speech recognition: An overview. Issues in visual and audio-visual speech processing 22 ( 2004 ), 23. Gerasimos Potamianos, Chalapathy Neti, Juergen Luettin, and Iain Matthews. 2004. Audio-visual automatic speech recognition: An overview. Issues in visual and audio-visual speech processing 22 (2004), 23."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-80"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Themos Stafylakis and Georgios Tzimiropoulos. 2017. Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105(2017). Themos Stafylakis and Georgios Tzimiropoulos. 2017. Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105(2017).","DOI":"10.21437\/Interspeech.2017-85"},{"key":"e_1_3_2_1_26_1","volume-title":"Proc. AVSP. 127\u2013131","author":"Thangthai Kwanchiva","year":"2015","unstructured":"Kwanchiva Thangthai , Richard\u00a0 W Harvey , Stephen\u00a0 J Cox , and Barry-John Theobald . 2015 . Improving lip-reading performance for robust audiovisual speech recognition using DNNs .. In Proc. AVSP. 127\u2013131 . Kwanchiva Thangthai, Richard\u00a0W Harvey, Stephen\u00a0J Cox, and Barry-John Theobald. 2015. Improving lip-reading performance for robust audiovisual speech recognition using DNNs.. In Proc. AVSP. 127\u2013131."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Michael Wand and J\u00fcrgen Schmidhuber. 2017. Improving speaker-independent lipreading with domain-adversarial training. arXiv preprint arXiv:1708.01565(2017). Michael Wand and J\u00fcrgen Schmidhuber. 2017. Improving speaker-independent lipreading with domain-adversarial training. arXiv preprint arXiv:1708.01565(2017).","DOI":"10.21437\/Interspeech.2017-421"},{"key":"e_1_3_2_1_29_1","unstructured":"Shuang Yang Yuanhang Zhang Dalu Feng Mingmin Yang Chenhao Wang Jingyun Xiao Keyu Long Shiguang Shan and Xilin Chen. 2018. LRW-1000: A Naturally-Distributed Large-Scale Benchmark for Lip Reading in the Wild. arXiv preprint arXiv:1810.06990(2018). Shuang Yang Yuanhang Zhang Dalu Feng Mingmin Yang Chenhao Wang Jingyun Xiao Keyu Long Shiguang Shan and Xilin Chen. 2018. LRW-1000: A Naturally-Distributed Large-Scale Benchmark for Lip Reading in the Wild. arXiv preprint arXiv:1810.06990(2018)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2918638"}],"event":{"name":"ICMI '19: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","acronym":"ICMI '19","location":"Suzhou China"},"container-title":["2019 International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3340555.3356102","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3340555.3356102","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:29Z","timestamp":1750202009000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3340555.3356102"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,14]]},"references-count":30,"alternative-id":["10.1145\/3340555.3356102","10.1145\/3340555"],"URL":"https:\/\/doi.org\/10.1145\/3340555.3356102","relation":{},"subject":[],"published":{"date-parts":[[2019,10,14]]},"assertion":[{"value":"2019-10-14","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}