{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T16:58:57Z","timestamp":1777568337196,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":79,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681221","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"1311-1320","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Efficient Training for Multilingual Visual Speech Recognition: Pre-training with Discretized Visual Speech Representation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6514-0018","authenticated-orcid":false,"given":"Minsu","family":"Kim","sequence":"first","affiliation":[{"name":"Korea Advanced Institute of Science &amp; Technology, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8135-6625","authenticated-orcid":false,"given":"Jeonghun","family":"Yeo","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science &amp; Technology, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8467-3576","authenticated-orcid":false,"given":"Se Jin","family":"Park","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science &amp; Technology, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9301-2760","authenticated-orcid":false,"given":"Hyeongseop","family":"Rha","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science &amp; Technology, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5306-6853","authenticated-orcid":false,"given":"Yong Man","family":"Ro","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science &amp; Technology, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1009"},{"key":"e_1_3_2_1_2_1","volume-title":"Andrew Senior, Oriol Vinyals, and Andrew Zisserman.","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018. Deep audio-visual speech recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 12 (2018), 8717--8727."},{"key":"e_1_3_2_1_3_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018. LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054253"},{"key":"e_1_3_2_1_5_1","volume-title":"Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599","author":"Assael Yannis M","year":"2016","unstructured":"Yannis M Assael, Brendan Shillingford, Shimon Whiteson, and Nando De Freitas. 2016. Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599 (2016)."},{"key":"e_1_3_2_1_6_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449--12460."},{"key":"e_1_3_2_1_7_1","volume-title":"Conformers are All You Need for Visual Speech Recogntion. arXiv preprint arXiv:2302.10915","author":"Chang Oscar","year":"2023","unstructured":"Oscar Chang, Hank Liao, Dmitriy Serdyuk, Ankit Shah, and Olivier Siohan. 2023. Conformers are All You Need for Visual Speech Recogntion. arXiv preprint arXiv:2302.10915 (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2051"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-194"},{"key":"e_1_3_2_1_11_1","volume-title":"NIPS 2014 Workshop on Deep Learning","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, Caglar Gulcehre, Kyunghyun Cho, and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. In NIPS 2014 Workshop on Deep Learning, December 2014."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"e_1_3_2_1_13_1","volume-title":"Lip Reading Sentences in the Wild. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE.","author":"Chung Joon Son","year":"2017","unstructured":"Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2017. Lip Reading Sentences in the Wild. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE."},{"key":"e_1_3_2_1_14_1","volume-title":"British Machine Vision Conference","author":"Chung Joon Son","year":"2017","unstructured":"Joon Son Chung and Andrew Zisserman. 2017. Lip reading in profile. In British Machine Vision Conference, 2017. British Machine Vision Association and Society for Pattern Recognition."},{"key":"e_1_3_2_1_15_1","volume-title":"Computer Vision--ACCV 2016: 13th Asian Conference on Computer Vision, Taipei, Taiwan, November 20--24","author":"Chung Joon Son","year":"2016","unstructured":"Joon Son Chung and Andrew Zisserman. 2017. Lip reading in the wild. In Computer Vision--ACCV 2016: 13th Asian Conference on Computer Vision, Taipei, Taiwan, November 20--24, 2016, Revised Selected Papers, Part II 13. Springer, 87--103."},{"key":"e_1_3_2_1_16_1","volume-title":"Cross-lingual language model pretraining. Advances in neural information processing systems","author":"Conneau Alexis","year":"2019","unstructured":"Alexis Conneau and Guillaume Lample. 2019. Cross-lingual language model pretraining. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"e_1_3_2_1_19_1","volume-title":"Lip2Vec: Efficient and Robust Visual Speech Recognition via Latent-to-Latent Visual to Audio Representation Mapping. arXiv preprint arXiv:2308.06112","author":"Dahou Djilali Yasser Abdelaziz","year":"2023","unstructured":"Yasser Abdelaziz Dahou Djilali, Sanath Narayan, Haithem Boussaid, Ebtessam Almazrouei, and Merouane Debbah. 2023. Lip2Vec: Efficient and Robust Visual Speech Recognition via Latent-to-Latent Visual to Audio Representation Mapping. arXiv preprint arXiv:2308.06112 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Proc. Interspeech. 3655--3659","author":"Elizabeth Salesky","year":"2021","unstructured":"Salesky Elizabeth, Wiesner Matthew, Bremerman Jacob, Roldano Cattoni, Matteo Negri, Marco Turchi, Douglas W Oard, and Post Matt. 2021. The Multilingual TEDx Corpus for Speech Recognition and Translation. In Proc. Interspeech. 3655--3659."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414858"},{"key":"e_1_3_2_1_23_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Haliassos Alexandros","year":"2022","unstructured":"Alexandros Haliassos, Pingchuan Ma, Rodrigo Mira, Stavros Petridis, and Maja Pantic. 2022. Jointly Learning Visual and Auditory Speech Representations from Raw Data. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_25_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_26_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01801"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_29_1","volume-title":"TranSpeech: Speech-to-Speech Translation With Bilateral Perturbation. In The Eleventh International Conference on Learning Representations.","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Jinglin Liu, Huadai Liu, Yi Ren, Lichao Zhang, Jinzheng He, and Zhou Zhao. 2022. TranSpeech: Speech-to-Speech Translation With Bilateral Perturbation. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003832"},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Machine Learning. PMLR, 10120--10134","author":"Jia Ye","year":"2022","unstructured":"Ye Jia, Michelle Tadmor Ramanovich, Tal Remez, and Roi Pomerantz. 2022. Translatotron 2: High-quality direct speech-to-speech translation with voice preservation. In International Conference on Machine Learning. PMLR, 10120--10134."},{"key":"e_1_3_2_1_32_1","unstructured":"Ziyue Jiang Yi Ren Zhenhui Ye Jinglin Liu Chen Zhang Qian Yang Shengpeng Ji Rongjie Huang Chunfeng Wang Xiang Yin et al. 2023. Mega-TTS: Zero-Shot Text-to-Speech at Scale with Intrinsic Inductive Bias. arXiv preprint arXiv:2306.03509 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Joon Son Chung, and Shinji Watanabe","author":"Heo Hee-Soo","year":"2022","unstructured":"Jee-weon Jung, Hee-Soo Heo, Bong-Jin Lee, Jaesong Lee, Hye-jin Shim, Youngki Kwon, Joon Son Chung, and Shinji Watanabe. 2022. Large-scale learning of generalised representations for speaker recognition. arXiv preprint arXiv:2210.10985 (2022)."},{"key":"e_1_3_2_1_34_1","volume-title":"Many-to-Many Spoken Language Translation via Unified Speech and Text Representation Learning with Unit-to-Unit Translation. arXiv preprint arXiv:2308.01831","author":"Kim Minsu","year":"2023","unstructured":"Minsu Kim, Jeongsoo Choi, Dahun Kim, and Yong Man Ro. 2023. Many-to-Many Spoken Language Translation via Unified Speech and Text Representation Learning with Unit-to-Unit Translation. arXiv preprint arXiv:2308.01831 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Shinji Watanabe, and Yong Man Ro.","author":"Kim Minsu","year":"2023","unstructured":"Minsu Kim, Jeongsoo Choi, Soumi Maiti, Jeong Hun Yeo, Shinji Watanabe, and Yong Man Ro. 2023. Towards Practical and Efficient Image-to-Speech Captioning with Vision-Language Pre-training and Multi-modal Tokens. arXiv preprint arXiv:2309.08531 (2023)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3115626"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01409"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.20003"},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Learning Representations.","author":"Kingma Diederik P","year":"2015","unstructured":"Diederik P Kingma and Jimmy Ba. 2015. Adam: A method for stochastic optimization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_40_1","volume-title":"Sentencepiece: A simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226","author":"Kudo Taku","year":"2018","unstructured":"Taku Kudo and John Richardson. 2018. Sentencepiece: A simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226 (2018)."},{"key":"e_1_3_2_1_41_1","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"Lakhotia Kushal","year":"2021","unstructured":"Kushal Lakhotia, Eugene Kharitonov, Wei-Ning Hsu, Yossi Adi, Adam Polyak, Benjamin Bolte, Tu-Anh Nguyen, Jade Copet, Alexei Baevski, Abdelrahman Mohamed, et al. 2021. On generative spoken language modeling from raw audio. Transactions of the Association for Computational Linguistics, Vol. 9 (2021), 1336--1354.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.63"},{"key":"e_1_3_2_1_43_1","volume-title":"British Machine Vision Conference.","author":"Luo Mingshuang","year":"2020","unstructured":"Mingshuang Luo, Shuang Yang, Xilin Chen, Zitao Liu, and Shiguang Shan. 2020. Synchronous bidirectional learning for multilingual lip reading. In British Machine Vision Conference."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing. 741--751","author":"Lux Florian","year":"2022","unstructured":"Florian Lux, Julia Koch, and Ngoc Thang Vu. 2022. Low-Resource Multilingual and Zero-Shot Multispeaker TTS. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing. 741--751."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096889"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415063"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-022-00550-z"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746706"},{"key":"e_1_3_2_1_50_1","volume-title":"Voxtlm: unified decoder-only models for consolidating speech recognition\/synthesis and speech\/text continuation tasks. arXiv preprint arXiv:2309.07937","author":"Maiti Soumi","year":"2023","unstructured":"Soumi Maiti, Yifan Peng, Shukjae Choi, Jee-weon Jung, Xuankai Chang, and Shinji Watanabe. 2023. Voxtlm: unified decoder-only models for consolidating speech recognition\/synthesis and speech\/text continuation tasks. arXiv preprint arXiv:2309.07937 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00545"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01582"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952625"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472088"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639643"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11032"},{"key":"e_1_3_2_1_59_1","volume-title":"End-to-End Speech Recognition: A Survey. arXiv preprint arXiv:2303.03329","author":"Prabhavalkar Rohit","year":"2023","unstructured":"Rohit Prabhavalkar, Takaaki Hori, Tara N Sainath, Ralf Schl\u00fcter, and Shinji Watanabe. 2023. End-to-End Speech Recognition: A Survey. arXiv preprint arXiv:2303.03329 (2023)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00510"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"e_1_3_2_1_62_1","volume-title":"International Conference on Machine Learning. PMLR, 28492--28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International Conference on Machine Learning. PMLR, 28492--28518."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01312"},{"key":"e_1_3_2_1_64_1","volume-title":"International Conference on Learning Representations.","author":"Shi Bowen","year":"2021","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdelrahman Mohamed. 2021. Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_65_1","volume-title":"International Conference on Learning Representations.","author":"Shim Kyuhong","year":"2021","unstructured":"Kyuhong Shim, Jungwook Choi, and Wonyong Sung. 2021. Understanding the role of self attention for efficient speech recognition. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097097"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-85"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461972"},{"key":"e_1_3_2_1_69_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"e_1_3_2_1_71_1","volume-title":"3rd International Conference on Learning Representations, ICLR","author":"Weston Jason","year":"2015","unstructured":"Jason Weston, Sumit Chopra, and Antoine Bordes. 2015. Memory networks. In 3rd International Conference on Learning Representations, ICLR 2015."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-901"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3352388"},{"key":"e_1_3_2_1_74_1","volume-title":"Multi-Temporal Lip-Audio Memory for Visual Speech Recognition. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Yeo Jeong Hun","year":"2023","unstructured":"Jeong Hun Yeo, Minsu Kim, and Yong Man Ro. 2023. Multi-Temporal Lip-Audio Memory for Visual Speech Recognition. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},{"key":"e_1_3_2_1_75_1","volume-title":"Visual Speech Recognition for Low-resource Languages with Automatic Labels From Whisper Model. arXiv preprint arXiv:2309.08535","author":"Yeo Jeong Hun","year":"2023","unstructured":"Jeong Hun Yeo, Minsu Kim, Shinji Watanabe, and Yong Man Ro. 2023. Visual Speech Recognition for Low-resource Languages with Automatic Labels From Whisper Model. arXiv preprint arXiv:2309.08535 (2023)."},{"key":"e_1_3_2_1_76_1","unstructured":"Ziqiang Zhang Long Zhou Chengyi Wang Sanyuan Chen Yu Wu Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et al. 2023. Speak foreign languages with your own voice: Cross-lingual neural codec language modeling. arXiv preprint arXiv:2303.03926 (2023)."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6174"},{"key":"e_1_3_2_1_78_1","volume-title":"Vatlm: Visual-audio-text pre-training with unified masked prediction for speech representation learning","author":"Zhu Qiushi","year":"2023","unstructured":"Qiushi Zhu, Long Zhou, Ziqiang Zhang, Shujie Liu, Binxing Jiao, Jie Zhang, Lirong Dai, Daxin Jiang, Jinyu Li, and Furu Wei. 2023. Vatlm: Visual-audio-text pre-training with unified masked prediction for speech representation learning. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_79_1","volume-title":"Learning Cross-Lingual Visual Speech Representations. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Zinonos Andreas","year":"2023","unstructured":"Andreas Zinonos, Alexandros Haliassos, Pingchuan Ma, Stavros Petridis, and Maja Pantic. 2023. Learning Cross-Lingual Visual Speech Representations. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681221","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681221","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681221"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":79,"alternative-id":["10.1145\/3664647.3681221","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681221","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}