{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T23:28:34Z","timestamp":1771025314583,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2018AAA0100603"],"award-info":[{"award-number":["2018AAA0100603"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61836002, 62072397"],"award-info":[{"award-number":["61836002, 62072397"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Natural Science Foundation","award":["LR19F020006"],"award-info":[{"award-number":["LR19F020006"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475220","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T20:56:12Z","timestamp":1634590572000},"page":"1359-1367","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["SimulLR: Simultaneous Lip Reading Transducer with Attention-Guided Adaptive Memory"],"prefix":"10.1145","author":[{"given":"Zhijie","family":"Lin","sequence":"first","affiliation":[{"name":"Zhejiang University, HangZhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, HangZhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haoyuan","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, HangZhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinglin","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang University, HangZhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Meng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, ShenZhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingshan","family":"Zeng","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, ShenZhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaofei","family":"He","sequence":"additional","affiliation":[{"name":"Zhejiang University, HangZhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Andrew Senior, Oriol Vinyals, and Andrew Zisserman.","author":"Afouras Triantafyllos","year":"2018"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.5555\/3045390.3045410"},{"key":"e_1_3_2_2_3_1","volume-title":"Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599","author":"Assael Yannis M","year":"2016"},{"key":"e_1_3_2_2_4_1","volume-title":"Variational memory addressing in generative models. arXiv preprint arXiv:1709.07116","author":"Bornschein J\u00f6rg","year":"2017"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413623"},{"key":"e_1_3_2_2_6_1","volume-title":"2020 b. Developing Real-time Streaming Transformer Transducer for Speech Recognition on Large-scale Dataset. arXiv preprint arXiv:2010.11395","author":"Chen Xie","year":"2020"},{"key":"e_1_3_2_2_7_1","volume-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555","author":"Chung Junyoung","year":"2014"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"e_1_3_2_2_9_1","volume-title":"Asian Conference on Computer Vision. Springer, 87--103","author":"Chung Joon Son","year":"2016"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_2_2_11_1","volume-title":"Incremental decoding and training methods for simultaneous translation in neural machine translation. arXiv preprint arXiv:1806.03661","author":"Dalvi Fahim","year":"2018"},{"key":"e_1_3_2_2_12_1","volume-title":"Sequence transduction with recurrent neural networks. arXiv preprint arXiv:1211.3711","author":"Graves Alex","year":"2012"},{"key":"e_1_3_2_2_13_1","volume-title":"Neural turing machines. arXiv preprint arXiv:1410.5401","author":"Graves Alex","year":"2014"},{"key":"e_1_3_2_2_14_1","volume-title":"Edward Grefenstette, Tiago Ramalho, John Agapiou, et al.","author":"Graves Alex","year":"2016"},{"key":"e_1_3_2_2_15_1","volume-title":"Proceedings of the 2014 Conference on empirical methods in natural language processing (EMNLP). 1342--1352","author":"He He Alvin Grissom II","year":"2014"},{"key":"e_1_3_2_2_16_1","volume-title":"Learning to translate in real-time with neural machine translation. arXiv preprint arXiv:1610.00388","author":"Gu Jiatao","year":"2016"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2407694"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054663"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.5555\/3157382.3157664"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/1577069.1755843"},{"key":"e_1_3_2_2_21_1","volume-title":"Variational memory encoder-decoder. arXiv preprint arXiv:1807.09950","author":"Le Hung","year":"2018"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054715"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/3045390.3045515"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413740"},{"key":"e_1_3_2_2_25_1","volume-title":"SimulMT to SimulST: Adapting Simultaneous Text Translation to End-to-End Simultaneous Speech Translation. arXiv preprint arXiv:2011.02048","author":"Ma Xutai","year":"2020"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053841"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683510"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/3305890.3305945"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-2090"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639643"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268935"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.350"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"crossref","unstructured":"Tara N Sainath Ruoming Pang David Rybach Yanzhang He Rohit Prabhavalkar Wei Li Mirk\u00f3 Visontai Qiao Liang Trevor Strohman Yonghui Wu etal 2019. Two-pass end-to-end speech recognition. arXiv preprint arXiv:1908.10992 (2019).  Tara N Sainath Ruoming Pang David Rybach Yanzhang He Rohit Prabhavalkar Wei Li Mirk\u00f3 Visontai Qiao Liang Trevor Strohman Yonghui Wu et al. 2019. Two-pass end-to-end speech recognition. arXiv preprint arXiv:1908.10992 (2019).","DOI":"10.21437\/Interspeech.2019-1341"},{"key":"e_1_3_2_2_34_1","volume-title":"Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105","author":"Stafylakis Themos","year":"2017"},{"key":"e_1_3_2_2_35_1","volume-title":"MT, and TTS. arXiv preprint arXiv:2011.04845","author":"Sudoh Katsuhito","year":"2020"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_2_37_1","volume-title":"Matching networks for one shot learning. arXiv preprint arXiv:1606.04080","author":"Vinyals Oriol","year":"2016"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472852"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00088"},{"key":"e_1_3_2_2_40_1","volume-title":"Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition. arXiv preprint arXiv:2012.05481","author":"Zhang Binbin","year":"2020"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00080"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6174"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475220","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475220","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:16Z","timestamp":1750193296000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475220"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":42,"alternative-id":["10.1145\/3474085.3475220","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475220","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}