{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:10:14Z","timestamp":1775200214512,"version":"3.50.1"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434749","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Interpreting the Role of Visemes in Audio-Visual Speech Recognition"],"prefix":"10.1109","author":[{"given":"Aristeidis","family":"Papadopoulos","sequence":"first","affiliation":[{"name":"Trinity College Dublin,Sigmedia Group, School of Engineering,Dublin,Ireland"}]},{"given":"Naomi","family":"Harte","sequence":"additional","affiliation":[{"name":"Trinity College Dublin,Sigmedia Group, School of Engineering,Dublin,Ireland"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Learning audio-visual speech representation by masked multimodal cluster prediction","volume-title":"International Conference on Learning Representations","author":"Shi"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096889"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-322"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.697"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3275873"},{"key":"ref6","article-title":"Jointly learning visual and auditory speech representations from raw data","volume-title":"The Eleventh International Conference on Learning Representations","author":"Haliassos"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389642"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref9","article-title":"wav2vec 2.0: a framework for self-supervised learning of speech representations","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems, ser. NIPS \u201920","author":"Baevski"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref11","first-page":"28 492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proceedings of the 40th International Conference on Machine Learning, ser. 
Proceedings of Machine Learning Research","volume":"202","author":"Radford"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2231"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414776"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICDMW58026.2022.00120"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.sigmorphon-1.9"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-373"},{"key":"ref18","article-title":"Understanding the role of self attention for efficient speech recognition","volume-title":"International Conference on Learning Representations","author":"Shim"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1985"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096149"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1157"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2507"},{"key":"ref23","volume-title":"Silence is sweeter than speech: Self-supervised model using silence to store speaker information","author":"Feng","year":"2022"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00656"},{"key":"ref25","article-title":"Understanding intermediate layers using linear classifier probes","volume-title":"International Conference on Learning Representations","author":"Alain"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2047"},{"issue":"86","key":"ref27","first-page":"2579","article-title":"Visualizing data using t-sne","volume":"9","author":"van der Maaten","year":"2008","journal-title":"Journal of Machine Learning Research"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1044\/jshr.1104.796"},{"key":"ref29","volume-title":"Speechreading (lipreading)","author":"Jeffers","year":"1971"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45683-X_60"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472029"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/SIU.2007.4298572"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.857572"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.5220\/0003731903220329"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-28954-6_7"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-85"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref38","article-title":"LRS3-TED: a large-scale dataset for visual speech recognition","volume":"abs\/1809.00496","author":"Afouras","year":"2018","journal-title":"CoRR"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"issue":"3","key":"ref40","first-page":"192","article-title":"Prosodylab-aligner: A tool for forced alignment of laboratory speech","volume":"39","author":"Gorman","year":"2011","journal-title":"Canadian Acoustics"},{"key":"ref41","article-title":"MUSAN: A music, speech, and noise 
corpus","volume":"abs\/1510.08484","author":"Snyder","year":"2015","journal-title":"CoRR"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-99"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-885"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888423"},{"key":"ref45","doi-asserted-by":"crossref","DOI":"10.23915\/distill.00002","article-title":"How to use t-SNE effectively","author":"Wattenberg","year":"2016","journal-title":"Distill"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.3390\/info11040193"},{"key":"ref47","article-title":"Barnes-Hut-SNE","volume-title":"International Conference on Learning Representations","author":"van der Maaten"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44503-X_27"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44668-0_68"},{"key":"ref50","volume-title":"t-Distributed Stochastic Neighbor Embedding","author":"van der Maaten","year":"2008"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434749.pdf?arnumber=11434749","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:59:23Z","timestamp":1775192363000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434749\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434749","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}