{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:03Z","timestamp":1765343043195,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China (NSFC)","award":["Grant 62371323, Grant 62401380, Grant U2433217, Grant U2333209"],"award-info":[{"award-number":["Grant 62371323, Grant 62401380, Grant U2433217, Grant U2333209"]}]},{"name":"the Natural Science Foundation of Sichuan Province","award":["Grant 2025ZNSFSC1476"],"award-info":[{"award-number":["Grant 2025ZNSFSC1476"]}]},{"name":"Sichuan Science and Technology Program","award":["Grant 2024YFG0010, Grant 2024ZDZX0046"],"award-info":[{"award-number":["Grant 2024YFG0010, Grant 2024ZDZX0046"]}]},{"name":"Institutional Research Fund from Sichuan University","award":["Grant 2024SCUQJTX030"],"award-info":[{"award-number":["Grant 2024SCUQJTX030"]}]},{"name":"the Open Fund of Key Laboratory of Flight Techniques and Flight Safety, CAAC","award":["Grant GY2024-01A"],"award-info":[{"award-number":["Grant GY2024-01A"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755639","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"2054-2063","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AV-RISE: Hierarchical Cross-Modal Denoising for Learning Robust Audio-Visual Speech Representation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5091-349X","authenticated-orcid":false,"given":"Zhishuo","family":"Zhao","sequence":"first","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7194-5023","authenticated-orcid":false,"given":"Yi","family":"Lin","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0393-5197","authenticated-orcid":false,"given":"Dongyue","family":"Guo","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9848-4261","authenticated-orcid":false,"given":"Junyu","family":"Fan","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2339736"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 1252-1261","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018a. Deep Audio-Visual Speech Recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 1252-1261. https:\/\/arxiv.org\/abs\/1809.02108"},{"key":"e_1_3_2_1_3_1","volume-title":"Andrew Senior, Oriol Vinyals, and Andrew Zisserman.","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018b. 
LRS3-TED: A Large-Scale Dataset for Visual Speech Recognition. In arXiv preprint arXiv:1809.00496. https:\/\/arxiv.org\/abs\/1809.00496"},{"key":"e_1_3_2_1_4_1","volume-title":"hear, and read: Deep aligned representations. arXiv preprint arXiv:1706.00932","author":"Aytar Yusuf","year":"2017","unstructured":"Yusuf Aytar, Carl Vondrick, and Antonio Torralba. 2017. See, hear, and read: Deep aligned representations. arXiv preprint arXiv:1706.00932 (2017)."},{"key":"e_1_3_2_1_5_1","volume-title":"Speech and Language. In International Conference on Machine Learning (ICML). PMLR, 1416-1429","author":"Baevski Alexei","year":"2023","unstructured":"Alexei Baevski, Arun Babu, Wei-Ning Hsu, Qiantong Xu, and Michael Auli. 2023. Efficient Self-Supervised Learning with Contextualized Target Representations for Vision, Speech and Language. In International Conference on Machine Learning (ICML). PMLR, 1416-1429."},{"key":"e_1_3_2_1_6_1","volume-title":"Data2Vec: A General Framework for Self-Supervised Learning in Speech, Vision and Language. arXiv preprint arXiv:2202.03555","author":"Baevski Alexei","year":"2022","unstructured":"Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, and Michael Auli. 2022. Data2Vec: A General Framework for Self-Supervised Learning in Speech, Vision and Language. arXiv preprint arXiv:2202.03555 (2022)."},{"key":"e_1_3_2_1_7_1","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 33. 12449-12460.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2007.02.006"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00229"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26484"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"e_1_3_2_1_13_1","first-page":"1086","article-title":"VoxCeleb2","author":"Chung Joon Son","year":"2018","unstructured":"Joon Son Chung, Arsha Nagrani, and Andrew Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In Proceedings of Interspeech. 1086-1090. https:\/\/arxiv.org\/abs\/1806.05622","journal-title":"Deep Speaker Recognition. In Proceedings of Interspeech."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/6046.865479"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2017.7965918"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_18_1","first-page":"21271","volume-title":"Proceedings of the 33rd Advances in Neural Information Processing Systems (NIPS)","volume":"33","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill, Florian Strub, Florent Altch\u00e9, Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al., 2020. 
Bootstrap Your Own Latent: A New Approach to Self-Supervised Learning. In Proceedings of the 33rd Advances in Neural Information Processing Systems (NIPS), Vol. 33. 21271-21284."},{"key":"e_1_3_2_1_19_1","volume-title":"Jointly learning visual and auditory speech representations from raw data. arXiv preprint arXiv:2212.06246","author":"Haliassos Alexandros","year":"2022","unstructured":"Alexandros Haliassos, Pingchuan Ma, Rodrigo Mira, Stavros Petridis, and Maja Pantic. 2022. Jointly learning visual and auditory speech representations from raw data. arXiv preprint arXiv:2212.06246 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448473"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01801"},{"key":"e_1_3_2_1_22_1","volume-title":"Visual context-driven audio feature enhancement for robust end-to-end audio-visual speech recognition. arXiv preprint arXiv:2207.06020","author":"Hong Jungho","year":"2022","unstructured":"Jungho Hong, Minjae Kim, Donghoon Yoo, Youngjune Kim, and Gunhee Kim. 2022. Visual context-driven audio feature enhancement for robust end-to-end audio-visual speech recognition. arXiv preprint arXiv:2207.06020 (2022)."},{"key":"e_1_3_2_1_23_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451-3460."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.848"},{"volume-title":"Robustness in automatic speech recognition: fundamentals and applications","author":"Junqua Jean-Claude","key":"e_1_3_2_1_25_1","unstructured":"Jean-Claude Junqua and Jean-Paul Haton. 2012. Robustness in automatic speech recognition: fundamentals and applications. Vol. 341. Springer Science & Business Media."},{"volume-title":"Multi-Task Corrupted Prediction for Learning Robust Audio-Visual Speech Representation. In The Thirteenth International Conference on Learning Representations.","author":"Kim Sungnyun","key":"e_1_3_2_1_26_1","unstructured":"Sungnyun Kim, Sungwoo Cho, Sangmin Bae, Kangwook Jang, and Se-Young Yun. [n.d.]. Multi-Task Corrupted Prediction for Learning Robust Audio-Visual Speech Representation. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2008.922789"},{"volume-title":"Automatic Speech Recognition: The Development of the SPHINX System","author":"Lee Kai-Fu","key":"e_1_3_2_1_28_1","unstructured":"Kai-Fu Lee. 1988. Automatic Speech Recognition: The Development of the SPHINX System. Springer Science & Business Media."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389642"},{"volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 740-755","author":"Lin Tsung-Yi","key":"e_1_3_2_1_30_1","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In Proceedings of the European Conference on Computer Vision (ECCV). 740-755. 
https:\/\/arxiv.org\/abs\/1405.0312"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096889"},{"key":"e_1_3_2_1_32_1","volume-title":"Lira: Learning visual speech representations from audio through self-supervision. arXiv preprint arXiv:2106.09171","author":"Ma Pingchuan","year":"2021","unstructured":"Pingchuan Ma, Rodrigo Mira, Stavros Petridis, Bj\u00f6rn W Schuller, and Maja Pantic. 2021a. Lira: Learning visual speech representations from audio through self-supervision. arXiv preprint arXiv:2106.09171 (2021)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414646"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3171679"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053841"},{"key":"e_1_3_2_1_36_1","volume-title":"Deep Multimodal Learning for Audio-Visual Speech Recognition. In 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2130-2134","author":"Mroueh Youssef","year":"2015","unstructured":"Youssef Mroueh, Etienne Marcheret, and Vaibhava Goel. 2015. Deep Multimodal Learning for Audio-Visual Speech Recognition. In 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2130-2134."},{"key":"e_1_3_2_1_37_1","unstructured":"Chalapathy Neti Gerasimos Potamianos Jean Luettin Iain Matthews Herv\u00e9 Glotin Dimitra Vergyri Joseph Sison Ahmed Mashari and Wen Zhou. 2000. Audio Visual Speech Recognition. Technical Report WS00-AVSR. Johns Hopkins University. Final Workshop Report CLSP\/JHU Summer Workshop."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-014-0629-7"},{"volume-title":"Proceedings of the SIGCHI Conference on Human Factors in Computing Systems. 19-25","author":"Petajan E.","key":"e_1_3_2_1_39_1","unstructured":"E. Petajan, B. Bischoff, D. Bodoff, and N. Brooke. 1988. An improved automatic lipreading system to enhance speech recognition. In Proceedings of the SIGCHI Conference on Human Factors in Computing Systems. 19-25."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"e_1_3_2_1_41_1","volume-title":"Lipsound2: Self-supervised pre-training for lip-to-speech reconstruction and lip reading","author":"Qu Leyuan","year":"2022","unstructured":"Leyuan Qu, Cornelius Weber, and Stefan Wermter. 2022. Lipsound2: Self-supervised pre-training for lip-to-speech reconstruction and lip reading. IEEE transactions on neural networks and learning systems, Vol. 35, 2 (2022), 2772-2782."},{"key":"e_1_3_2_1_42_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_43_1","volume-title":"wav2vec: Unsupervised pre-training for speech recognition. arXiv preprint arXiv:1904.05862","author":"Schneider Steffen","year":"2019","unstructured":"Steffen Schneider, Alexei Baevski, Ronan Collobert, and Michael Auli. 2019. wav2vec: Unsupervised pre-training for speech recognition. 
arXiv preprint arXiv:1904.05862 (2019)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02195"},{"key":"e_1_3_2_1_45_1","volume-title":"Learning audio-visual speech representation by masked multimodal cluster prediction. arXiv preprint arXiv:2201.02184","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdelrahman Mohamed. 2022b. Learning audio-visual speech representation by masked multimodal cluster prediction. arXiv preprint arXiv:2201.02184 (2022)."},{"key":"e_1_3_2_1_46_1","volume-title":"Robust self-supervised audio-visual speech recognition. arXiv preprint arXiv:2201.01763","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, and Abdelrahman Mohamed. 2022a. Robust self-supervised audio-visual speech recognition. arXiv preprint arXiv:2201.01763 (2022)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"e_1_3_2_1_48_1","volume-title":"MUSAN: A Music, Speech, and Noise Corpus. arXiv preprint arXiv:1510.08484","author":"Snyder David","year":"2015","unstructured":"David Snyder, Guoguo Chen, and Daniel Povey. 2015. MUSAN: A Music, Speech, and Noise Corpus. arXiv preprint arXiv:1510.08484 (2015). https:\/\/arxiv.org\/abs\/1510.08484"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the 42nd IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 1-5. https:\/\/arxiv.org\/abs\/1703","author":"Stafylakis Themos","year":"2017","unstructured":"Themos Stafylakis and Georgios Tzimiropoulos. 2017. Combining Residual Networks with LS\u2122s for Lipreading. In Proceedings of the 42nd IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 1-5. https:\/\/arxiv.org\/abs\/1703.04105"},{"key":"e_1_3_2_1_50_1","first-page":"3454","article-title":"Attention-Based Audio-Visual Fusion for Robust Automatic Speech Recognition","author":"Sterpu Georgian","year":"2018","unstructured":"Georgian Sterpu, Christian Saam, and Naomi Harte. 2018. Attention-Based Audio-Visual Fusion for Robust Automatic Speech Recognition. In Proc. Interspeech. 3454-3458.","journal-title":"Proc. Interspeech."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(99)00038-2"},{"key":"e_1_3_2_1_52_1","volume-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461","author":"Wang Alex","year":"2018","unstructured":"Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R Bowman. 2018. GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29882"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.3390\/math11071733"},{"volume-title":"Automatic Speech Recognition","author":"Yu Dong","key":"e_1_3_2_1_56_1","unstructured":"Dong Yu and Li Deng. 2016. Automatic Speech Recognition. Vol. 1. Springer, Berlin."},{"key":"e_1_3_2_1_57_1","volume-title":"The 16th Asian Conference on Machine Learning (Conference Track).","author":"Zhao Zhishuo","year":"2024","unstructured":"Zhishuo Zhao, Dongyue Guo, Wenjie Ou, Hong Liu, and Yi Lin. 2024. AMG-AVSR: Adaptive Modality Guidance for Audio-Visual Speech Recognition via Progressive Feature Enhancement. 
In The 16th Asian Conference on Machine Learning (Conference Track)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29951"},{"key":"e_1_3_2_1_59_1","volume-title":"VATLM: Visual-Audio-Text Pre-training with Unified Masked Prediction for Speech Representation Learning","author":"Zhu Qian","year":"2023","unstructured":"Qian Zhu, Lingyun Zhou, Zhen Zhang, Zhiyao Zhang, and Xiao Liu. 2023. VATLM: Visual-Audio-Text Pre-training with Unified Masked Prediction for Speech Representation Learning. IEEE Transactions on Multimedia (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755639","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:24Z","timestamp":1765342764000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755639"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":59,"alternative-id":["10.1145\/3746027.3755639","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755639","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
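The object above follows the Crossref REST API work schema ("message-type":"work"). Below is a minimal Python sketch of how such a record might be retrieved and its main fields read; it assumes the public api.crossref.org endpoint and uses only field names that appear in the JSON above (message, title, author, issued, references-count, DOI), so it is an illustration rather than a definitive client.

# Minimal sketch: fetch this work record from the public Crossref REST API
# (assumption: https://api.crossref.org/works/{DOI} serves the same JSON as above)
# and read a few of the fields shown in the record.
import json
import urllib.request

DOI = "10.1145/3746027.3755639"

with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    work = json.load(resp)["message"]  # the "message" object holds the work metadata

title = work["title"][0]                                    # paper title
authors = [f'{a["given"]} {a["family"]}' for a in work["author"]]
published = work["issued"]["date-parts"][0]                 # e.g. [2025, 10, 27]
n_refs = work["references-count"]                           # number of entries under "reference"

print(title)
print(", ".join(authors))
print(f"Published {published}, {n_refs} references, https://doi.org/{work['DOI']}")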