{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T03:31:55Z","timestamp":1764646315189,"version":"3.28.0"},"reference-count":35,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,1,10]],"date-time":"2021-01-10T00:00:00Z","timestamp":1610236800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,10]],"date-time":"2021-01-10T00:00:00Z","timestamp":1610236800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,10]],"date-time":"2021-01-10T00:00:00Z","timestamp":1610236800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,1,10]]},"DOI":"10.1109\/icpr48806.2021.9412187","type":"proceedings-article","created":{"date-parts":[[2021,5,5]],"date-time":"2021-05-05T22:15:54Z","timestamp":1620252954000},"page":"5286-5293","source":"Crossref","is-referenced-by-count":4,"title":["A Neural Lip-Sync Framework for Synthesizing Photorealistic Virtual News Anchors"],"prefix":"10.1109","author":[{"given":"Ruobing","family":"Zheng","sequence":"first","affiliation":[]},{"given":"Zhou","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Song","sequence":"additional","affiliation":[]},{"given":"Changjiang","family":"Ji","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref33","article-title":"Melnet: A generative model for audio in the frequency domain","author":"vasquez","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865702206046"},{"key":"ref31","article-title":"Lip synchronization of speech","author":"mcallister","year":"1997","journal-title":"Audio-Visual Speech Processing Computational & Cognitive Science Approaches"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00603"},{"key":"ref35","article-title":"Image-to-image translation with conditional adversarial networks","author":"isola","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00955"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR-Adjunct.2019.00-40"},{"key":"ref12","article-title":"You said that?","author":"chung","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8545744"},{"key":"ref14","article-title":"Audio-driven talking face video generation with natural head pose","author":"yi","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref15","article-title":"Neu-ral voice puppetry: Audio-driven facial reenactment","author":"thies","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref16","article-title":"Obamanet: Photo-realistic lip-sync from text","author":"kumar","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/258734.258880"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.885910"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073699"},{"key":"ref28","article-title":"Synthesizing photo-real talking head via trajectory-guided sample selection","author":"wang","year":"0","journal-title":"Eleventh Annual Conference of the International Speech Communication Association"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00917"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"95","DOI":"10.1145\/3072959.3073640","article-title":"Syn-thesizing obama: learning lip sync from audio","volume":"36","author":"suwajanakorn","year":"2017","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"ref6","first-page":"366","article-title":"Audiozface: Generating speech\/face animation from single audio with attention-based bidirectional lstm networks","author":"tian","year":"0","journal-title":"2019 IEEE International Conference on Multimedia & Expo Workshops (ICMEW)"},{"key":"ref29","article-title":"Video-to-video synthesis","author":"wang","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323028"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178899"},{"key":"ref9","first-page":"37","article-title":"End-to-end speech-driven realistic facial animation with temporal gans","author":"vougioukas","year":"0","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops"},{"key":"ref1","article-title":"Text driven 3d photo-realistic talking head","author":"wang","year":"0","journal-title":"Twelfth Annual Conference of the International Speech Communication Association"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2006.12.001"},{"journal-title":"Learning to forget Continual prediction with lstm","year":"1999","author":"gers","key":"ref22"},{"key":"ref21","first-page":"1033","article-title":"Learning recurrent neural networks with hessian-free optimization","author":"martens","year":"0","journal-title":"Proceedings of the 28th international conference on machine learning (ICML-11) Citeseer"},{"key":"ref24","article-title":"An empirical evaluation of generic convolutional and recurrent networks for sequence modeling","author":"bai","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref23","article-title":"The fall of rnn\/lstm","author":"culurciello","year":"2018","journal-title":"Towards Data Science"},{"journal-title":"Keras TCN","year":"2018","author":"philippe","key":"ref26"},{"key":"ref25","article-title":"Wavenet: A generative model for raw audio","author":"oord","year":"2016","journal-title":"ArXiv Preprint"}],"event":{"name":"2020 25th International Conference on Pattern Recognition (ICPR)","start":{"date-parts":[[2021,1,10]]},"location":"Milan, Italy","end":{"date-parts":[[2021,1,15]]}},"container-title":["2020 25th International Conference on Pattern Recognition (ICPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9411940\/9411911\/09412187.pdf?arnumber=9412187","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T11:40:45Z","timestamp":1652182845000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9412187\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,1,10]]},"references-count":35,"URL":"https:\/\/doi.org\/10.1109\/icpr48806.2021.9412187","relation":{},"subject":[],"published":{"date-parts":[[2021,1,10]]}}}