{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:03:02Z","timestamp":1776888182796,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611787","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"5523-5531","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["Towards Accurate Lip-to-Speech Synthesis in-the-Wild"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2845-5570","authenticated-orcid":false,"given":"Sindhu","family":"Hegde","sequence":"first","affiliation":[{"name":"University of Oxford, Oxford, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6628-7065","authenticated-orcid":false,"given":"Rudrabha","family":"Mukhopadhyay","sequence":"additional","affiliation":[{"name":"IIIT Hyderabad, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6767-7057","authenticated-orcid":false,"given":"C.V","family":"Jawahar","sequence":"additional","affiliation":[{"name":"IIIT Hyderabad, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5262-9722","authenticated-orcid":false,"given":"Vinay","family":"Namboodiri","sequence":"additional","affiliation":[{"name":"University of Bath, Bath, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Andrew Senior, Oriol Vinyals, and Andrew Zisserman.","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018c. Deep audio-visual speech recognition. IEEE transactions on pattern analysis and machine intelligence (2018)."},{"key":"e_1_3_2_1_2_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018a. Deep Lip Reading: a comparison of models and an online application. In INTERSPEECH."},{"key":"e_1_3_2_1_3_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018b. LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018)."},{"key":"e_1_3_2_1_4_1","volume-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Akbari Hassan","year":"2017","unstructured":"Hassan Akbari, Himani Arora, Liangliang Cao, and Nima Mesgarani. 2017. Lip2Audspec: Speech Reconstruction from Silent Lip Movements Video. 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2017), 2516--2520."},{"key":"e_1_3_2_1_5_1","volume-title":"Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599","author":"Assael Yannis M","year":"2016","unstructured":"Yannis M Assael, Brendan Shillingford, Shimon Whiteson, and Nando De Freitas. 2016. Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599 (2016)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"e_1_3_2_1_8_1","volume-title":"Asian Conference on Computer Vision. Springer, 87--103","author":"Chung Joon Son","year":"2016","unstructured":"Joon Son Chung and Andrew Zisserman. 2016a. Lip reading in the wild. In Asian Conference on Computer Vision. Springer, 87--103."},{"key":"e_1_3_2_1_9_1","volume-title":"Workshop on Multi-view Lip-reading, ACCV.","author":"Chung J. S.","unstructured":"J. S. Chung and A. Zisserman. 2016b. Out of time: automated lip sync in the wild. In Workshop on Multi-view Lip-reading, ACCV."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.61"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953127"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Markus Freitag and Yaser Al-Onaizan. 2017. Beam Search Strategies for Neural Machine Translation. In NMT@ACL.","DOI":"10.18653\/v1\/W17-3207"},{"key":"e_1_3_2_1_14_1","volume-title":"BigVGAN: A Universal Neural Vocoder with Large-Scale Training. ArXiv","author":"Lee Sang","year":"2022","unstructured":"Sang gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, and Sung-Hoon Yoon. 2022. BigVGAN: A Universal Neural Vocoder with Large-Scale Training. ArXiv, Vol. abs\/2206.04658 (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2407694"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01033"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548081"},{"key":"e_1_3_2_1_18_1","volume-title":"Neural Dubber: Dubbing for Videos According to Scripts. In NeurIPS.","author":"Hu Chenxu","year":"2021","unstructured":"Chenxu Hu, Qiao Tian, Tingle Li, Yuping Wang, Yuxuan Wang, and Hang Zhao. 2021. Neural Dubber: Dubbing for Videos According to Scripts. In NeurIPS."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.5555\/3327345.3327360"},{"key":"e_1_3_2_1_20_1","unstructured":"Minsu Kim Joanna Hong and Yong Man Ro. 2022. Lip to Speech Synthesis with Visual Context Attentional GAN. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_21_1","volume-title":"Lip-to-Speech Synthesis in the Wild with Multi-task Learning. ArXiv","author":"Kim Minsu","year":"2023","unstructured":"Minsu Kim, Joanna Hong, and Yong Man Ro. 2023. Lip-to-Speech Synthesis in the Wild with Multi-task Learning. ArXiv, Vol. abs\/2302.08841 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. CoRR, Vol. abs\/1412.6980 (2015)."},{"key":"e_1_3_2_1_23_1","volume-title":"SVTS: Scalable Video-to-Speech Synthesis. In Interspeech.","author":"Mira Rodrigo","year":"2022","unstructured":"Rodrigo Mira, Alexandros Haliassos, Stavros Petridis, Bj\u00f6rn Schuller, and Maja Pantic. 2022. SVTS: Scalable Video-to-Speech Synthesis. In Interspeech."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Liliane Momeni Hannah Bull Prajwal K R Samuel Albanie G\u00fcl Varol and Andrew Zisserman. 2022. Automatic dense annotation of large-vocabulary sign language videos. In ECCV.","DOI":"10.1007\/978-3-031-19833-5_39"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HJtEm4p6Z","author":"Ping Wei","year":"2018","unstructured":"Wei Ping, Kainan Peng, Andrew Gibiansky, Sercan O. Arik, Ajay Kannan, Sharan Narang, Jonathan Raiman, and John Miller. 2018. Deep Voice 3: 2000-Speaker Neural Text-to-Speech. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HJtEm4p6Z"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00510"},{"key":"e_1_3_2_1_27_1","volume-title":"The IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Prajwal K R","unstructured":"K R Prajwal, Rudrabha Mukhopadhyay, Vinay P. Namboodiri, and C.V. Jawahar. 2020a. Learning Individual Speaking Styles for Accurate Lip to Speech Synthesis. In The IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Prajwal K R Liliane Momeni Triantafyllos Afouras and Andrew Zisserman. 2021. Visual Keyword Spotting with Attention. In BMVC.","DOI":"10.5244\/C.35.212"},{"key":"e_1_3_2_1_30_1","volume-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558","author":"Ren Yi","year":"2020","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2020. Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558 (2020)."},{"key":"e_1_3_2_1_31_1","volume-title":"Personalized One-Shot Lipreading for an ALS Patient. arXiv preprint arXiv:2111.01740","author":"Sen Bipasha","year":"2021","unstructured":"Bipasha Sen, Aditya Agarwal, Rudrabha Mukhopadhyay, Vinay Namboodiri, and CV Jawahar. 2021. Personalized One-Shot Lipreading for an ALS Patient. arXiv preprint arXiv:2111.01740 (2021)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10920"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_34_1","volume-title":"Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. ArXiv","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdel rahman Mohamed. 2022. Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. ArXiv, Vol. abs\/2201.02184 (2022)."},{"key":"e_1_3_2_1_35_1","volume-title":"Attention is All you Need. ArXiv","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam M. Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. ArXiv, Vol. abs\/1706.03762 (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"Video-Driven Speech Reconstruction using Generative Adversarial Networks. arXiv preprint arXiv:1906.06301","author":"Vougioukas Konstantinos","year":"2019","unstructured":"Konstantinos Vougioukas, Pingchuan Ma, Stavros Petridis, and Maja Pantic. 2019. Video-Driven Speech Reconstruction using Generative Adversarial Networks. arXiv preprint arXiv:1906.06301 (2019)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611787","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611787","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:38Z","timestamp":1755820838000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611787"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":36,"alternative-id":["10.1145\/3581783.3611787","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611787","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}