{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T02:31:57Z","timestamp":1771468317332,"version":"3.50.1"},"reference-count":45,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Institute of Information and Communications Technology Planning and Evaluation"},{"name":"Korea Government","award":["2019-0-01842"],"award-info":[{"award-number":["2019-0-01842"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Signal Process. Lett."],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/lsp.2025.3537949","type":"journal-article","created":{"date-parts":[[2025,2,3]],"date-time":"2025-02-03T18:34:38Z","timestamp":1738607678000},"page":"961-965","source":"Crossref","is-referenced-by-count":4,"title":["Text-to-Speech With Lip Synchronization Based on Speech-Assisted Text-to-Video Alignment and Masked Unit Prediction"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7579-662X","authenticated-orcid":false,"given":"Youngdo","family":"Ahn","sequence":"first","affiliation":[{"name":"School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8900-251X","authenticated-orcid":false,"given":"Jongwook","family":"Chae","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8910-0264","authenticated-orcid":false,"given":"Jong Won","family":"Shin","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"16582","article-title":"Neural dubber: Dubbing for videos according to scripts","volume-title":"Proc. 35th Int. Conf. Neural Inf. Process. Syst.","author":"Hu","year":"2021"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2021-38"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746421"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2179"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref7","first-page":"3171","article-title":"FastSpeech: Fast, robust and controllable text to speech","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Ren","year":"2019"},{"key":"ref8","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","author":"Ren","year":"2020","journal-title":"arXiv:2006.04558"},{"key":"ref9","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim","year":"2021"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-534"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26488"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1128"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/575"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446098"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3369537"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053732"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095621"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446852"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3139"},{"key":"ref23","first-page":"8067","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kim","year":"2020"},{"key":"ref24","article-title":"Classifier-free diffusion guidance","volume-title":"Proc. NeurIPS 2021 Workshop Deep Generative Models Downstream Appl.","author":"Ho","year":"2021"},{"key":"ref25","first-page":"16784","article-title":"GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models","volume-title":"Proc. Inter. Conf. Mach. Learn.","author":"Nichol","year":"2022"},{"key":"ref26","article-title":"AudioGen: Textually guided audio generation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kreuk","year":"2023"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2021.06.003"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref30","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"Lakhotia","year":"2021","journal-title":"Trans. Assoc. Comput. Linguistics"},{"key":"ref31","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"ref34","article-title":"Learning audio-visual speech representation by masked multimodal cluster prediction","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Shi","year":"2022"},{"key":"ref35","article-title":"LRS3-TED: A large-scale dataset for visual speech recognition","author":"Afouras","year":"2018","journal-title":"arXiv:1809.00496"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1982.1056489"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3038524"},{"key":"ref42","first-page":"359","article-title":"Using dynamic time warping to find patterns in time series","volume-title":"Proc. 3rd Int. Conf. Knowl. Discov. Data Mining","author":"Berndt","year":"1994"},{"key":"ref43","first-page":"12449","article-title":"Wav2Vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Baevski","year":"2020"},{"key":"ref44","article-title":"Methods for subjective determination of transmission quality","volume-title":"ITU-T","year":"1996"},{"key":"ref45","article-title":"Relative timing of sound and vision for broadcasting","volume-title":"I.-R. R. BT","year":"1998"}],"container-title":["IEEE Signal Processing Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/97\/10802935\/10870201.pdf?arnumber=10870201","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,5]],"date-time":"2025-03-05T19:04:58Z","timestamp":1741201498000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10870201\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":45,"URL":"https:\/\/doi.org\/10.1109\/lsp.2025.3537949","relation":{},"ISSN":["1070-9908","1558-2361"],"issn-type":[{"value":"1070-9908","type":"print"},{"value":"1558-2361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}