{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T17:54:37Z","timestamp":1770054877406,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62171250"],"award-info":[{"award-number":["62171250"]}]},{"name":"National Natural Science Foundation of China","award":["62301075"],"award-info":[{"award-number":["62301075"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,12]]},"DOI":"10.1145\/3784833.3784842","type":"proceedings-article","created":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T05:22:31Z","timestamp":1770009751000},"page":"161-165","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ALIGNVSR: AUDIO-VISUAL CROSS-MODAL ALIGNMENT FOR VISUAL SPEECH RECOGNITION"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9660-8702","authenticated-orcid":false,"given":"Zehua","family":"Liu","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2307-2096","authenticated-orcid":false,"given":"Xiaolou","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3698-7636","authenticated-orcid":false,"given":"Chen","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9723-3294","authenticated-orcid":false,"given":"Li","family":"Guo","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5546-8060","authenticated-orcid":false,"given":"Lantian","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1286-0644","authenticated-orcid":false,"given":"Dong","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2026,2]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Young\u00a0Jin Ahn Jungwoo Park Sangha Park Jonghyun Choi and Kee-Eung Kim. 2024. SyncVSR: Data-Efficient Visual Speech Recognition with End-to-End Crossmodal Audio Token Synchronization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.12233 (2024)."},{"key":"e_1_3_3_2_3_2","unstructured":"Yannis\u00a0M Assael Brendan Shillingford Shimon Whiteson and Nando De\u00a0Freitas. 2016. Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1611.01599 (2016)."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.116"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"crossref","unstructured":"Chen Chen Zehua Liu Xiaolou Li Lantian Li and Dong Wang. 2024. CNVSRC 2023: The First Chinese Continuous Visual Speech Recognition Challenge. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.10313 (2024).","DOI":"10.21437\/Interspeech.2024-2509"},{"key":"e_1_3_3_2_6_2","unstructured":"Jiankang Deng Jia Guo Yuxiang Zhou Jinke Yu Irene Kotsia and Stefanos Zafeiriou. 2019. Retinaface: Single-stage dense face localisation in the wild. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1905.00641 (2019)."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01268"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Anmol Gulati James Qin Chung-Cheng Chiu Niki Parmar Yu Zhang Jiahui Yu Wei Han Shibo Wang Zhengdong Zhang Yonghui Wu et\u00a0al. 2020. Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.08100 (2020).","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio speech and language processing 29 (2021) 3451\u20133460.","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448428"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096889"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Pingchuan Ma Stavros Petridis and Maja Pantic. 2022. Visual speech recognition for multiple languages in the wild. Nature Machine Intelligence 4 11 (2022) 930\u2013939.","DOI":"10.1038\/s42256-022-00550-z"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00510"},{"key":"e_1_3_3_2_16_2","unstructured":"Brendan Shillingford Yannis Assael Matthew\u00a0W Hoffman Thomas Paine C\u00edan Hughes Utsav Prabhu Hank Liao Hasim Sak Kanishka Rao Lorrayne Bennett et\u00a0al. 2018. Large-scale visual speech recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1807.05162 (2018)."},{"key":"e_1_3_3_2_17_2","first-page":"6447","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Son\u00a0Chung Joon","year":"2017","unstructured":"Joon Son\u00a0Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2017. Lip reading sentences in the wild. In Proceedings of the IEEE conference on computer vision and pattern recognition. 6447\u20136456."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Themos Stafylakis and Georgios Tzimiropoulos. 2017. Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1703.04105 (2017).","DOI":"10.21437\/Interspeech.2017-85"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Shinji Watanabe Takaaki Hori Suyoun Kim John\u00a0R Hershey and Tomoki Hayashi. 2017. Hybrid CTC\/attention architecture for end-to-end speech recognition. IEEE Journal of Selected Topics in Signal Processing 11 8 (2017) 1240\u20131253.","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"e_1_3_3_2_20_2","unstructured":"Jeong\u00a0Hun Yeo Minsu Kim Jeongsoo Choi Dae\u00a0Hoe Kim and Yong\u00a0Man Ro. 2024. Akvsr: Audio knowledge empowered visual speech recognition by compressing audio knowledge of a pretrained model. IEEE Transactions on Multimedia (2024)."}],"event":{"name":"ICCIP 2025: 2025 the 11th International Conference on Communication and Information Processing","location":"Lingshui Hainan China","acronym":"ICCIP 2025"},"container-title":["Proceedings of the 2025 11th International Conference on Communication and Information Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3784833.3784842","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T07:47:02Z","timestamp":1770018422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3784833.3784842"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,12]]},"references-count":19,"alternative-id":["10.1145\/3784833.3784842","10.1145\/3784833"],"URL":"https:\/\/doi.org\/10.1145\/3784833.3784842","relation":{},"subject":[],"published":{"date-parts":[[2025,11,12]]},"assertion":[{"value":"2026-02-01","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}