{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,12]],"date-time":"2025-06-12T05:25:19Z","timestamp":1749705919549,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2020,9,22]],"date-time":"2020-09-22T00:00:00Z","timestamp":1600732800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,9,22]],"date-time":"2020-09-22T00:00:00Z","timestamp":1600732800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2021,1]]},"DOI":"10.1007\/s00371-020-01982-7","type":"journal-article","created":{"date-parts":[[2020,9,22]],"date-time":"2020-09-22T21:38:46Z","timestamp":1600810726000},"page":"95-105","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Fine-grained talking face generation with video reinterpretation"],"prefix":"10.1007","volume":"37","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7113-5066","authenticated-orcid":false,"given":"Xin","family":"Huang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingjie","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minglun","family":"Gong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,9,22]]},"reference":[{"key":"1982_CR1","first-page":"353","volume":"97","author":"C Bregler","year":"1997","unstructured":"Bregler, C., Covell, M., Slaney, M.: Video rewrite: driving visual speech with audio. Siggraph 97, 353\u2013360 (1997)","journal-title":"Siggraph"},{"key":"1982_CR2","doi-asserted-by":"crossref","unstructured":"Bulat, A., Tzimiropoulos, G.: How far are we from solving the 2d & 3d face alignment problem?(and a dataset of 230,000 3d facial landmarks). In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1021\u20131030 (2017)","DOI":"10.1109\/ICCV.2017.116"},{"key":"1982_CR3","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, Z., Maddox, R.K., Duan, Z., Xu, C.: Lip movements generation at a glance. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 520\u2013535 (2018)","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"1982_CR4","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7832\u20137841 (2019)","DOI":"10.1109\/CVPR.2019.00802"},{"key":"1982_CR5","unstructured":"Chung, J.S., Jamaludin, A., Zisserman, A.: You said that? (2017). arXiv preprint arXiv:1705.02966"},{"key":"1982_CR6","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Zisserman, A.: Lip reading in the wild. In: Asian Conference on Computer Vision, pp. 87\u2013103. Springer, Berlin (2016)","DOI":"10.1007\/978-3-319-54184-6_6"},{"issue":"5","key":"1982_CR7","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S., Shao, X.: An audio-visual corpus for speech perception and automatic speech recognition. J. Acoust. Soc. Am. 120(5), 2421\u20132424 (2006)","journal-title":"J. Acoust. Soc. Am."},{"key":"1982_CR8","doi-asserted-by":"crossref","unstructured":"Deng, Z., Neumann, U.: Expressive speech animation synthesis with phoneme-level controls. In: Computer Graphics Forum, vol.\u00a027, pp. 2096\u20132113. Wiley Online Library, Hoboken (2008)","DOI":"10.1111\/j.1467-8659.2008.01192.x"},{"issue":"9","key":"1982_CR9","doi-asserted-by":"publisher","first-page":"5287","DOI":"10.1007\/s11042-015-2944-3","volume":"75","author":"B Fan","year":"2016","unstructured":"Fan, B., Xie, L., Yang, S., Wang, L., Soong, F.K.: A deep bidirectional LSTM approach for video-realistic talking head. Multimed. Tools Appl 75(9), 5287\u20135309 (2016)","journal-title":"Multimed. Tools Appl"},{"key":"1982_CR10","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., Bengio, Y.: Generative adversarial nets. In: Advances in Neural Information Processing Systems, pp. 2672\u20132680 (2014)"},{"key":"1982_CR11","doi-asserted-by":"crossref","unstructured":"Huang, X., Wang, M., Gong, M.: Hierarchically-fused generative adversarial network for text to realistic image synthesis. In: 2019 16th Conference on Computer and Robot Vision (CRV), pp. 73\u201380. IEEE (2019)","DOI":"10.1109\/CRV.2019.00018"},{"issue":"4","key":"1982_CR12","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. TOG 36(4), 94 (2017)","journal-title":"TOG"},{"key":"1982_CR13","doi-asserted-by":"crossref","unstructured":"Kim, Y., Lee, S.H.: Keyframe-based multi-contact motion synthesis. Vis. Comput. 1\u201315 (2020)","DOI":"10.1007\/s00371-020-01956-9"},{"key":"1982_CR14","doi-asserted-by":"crossref","unstructured":"Li, Y., Min, M.R., Shen, D., Carlson, D., Carin, L.: Video generation from text. In: Thirty-Second AAAI Conference on Artificial Intelligence (2018)","DOI":"10.1609\/aaai.v32i1.12233"},{"key":"1982_CR15","doi-asserted-by":"crossref","unstructured":"Luong, M.T., Pham, H., Manning, C.D.: Effective approaches to attention-based neural machine translation (2015). arXiv preprint arXiv:1508.04025","DOI":"10.18653\/v1\/D15-1166"},{"key":"1982_CR16","doi-asserted-by":"crossref","unstructured":"Ma, S., Fu, J., Wen\u00a0Chen, C., Mei, T.: Da-gan: instance-level image translation by deep attention generative adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5657\u20135666 (2018)","DOI":"10.1109\/CVPR.2018.00593"},{"issue":"11","key":"1982_CR17","doi-asserted-by":"publisher","first-page":"1915","DOI":"10.1109\/TVCG.2012.67","volume":"18","author":"X Ma","year":"2012","unstructured":"Ma, X., Deng, Z.: A statistical quality model for data-driven speech animation. IEEE Trans. Visual Comput. Graph. 18(11), 1915\u20131927 (2012)","journal-title":"IEEE Trans. Visual Comput. Graph."},{"key":"1982_CR18","unstructured":"Mathieu, M., Couprie, C., LeCun, Y.: Deep multi-scale video prediction beyond mean square error (2015). arXiv preprint arXiv:1511.05440"},{"key":"1982_CR19","unstructured":"Oh, J., Guo, X., Lee, H., Lewis, R.L., Singh, S.: Action-conditional video prediction using deep networks in atari games. In: Advances in Neural Information Processing Systems, pp. 2863\u20132871 (2015)"},{"key":"1982_CR20","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., et\u00a0al.: Deep face recognition. In: BMVC, vol.\u00a01, p.\u00a06 (2015)","DOI":"10.5244\/C.29.41"},{"key":"1982_CR21","doi-asserted-by":"crossref","unstructured":"Petridis, S., Li, Z., Pantic, M.: End-to-end visual speech recognition with LSTMS. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2592\u20132596. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952625"},{"key":"1982_CR22","doi-asserted-by":"crossref","unstructured":"Petridis, S., Stafylakis, T., Ma, P., Cai, F., Tzimiropoulos, G., Pantic, M.: End-to-end audiovisual speech recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6548\u20136552. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"1982_CR23","doi-asserted-by":"crossref","unstructured":"Pumarola, A., Agudo, A., Martinez, A.M., Sanfeliu, A., Moreno-Noguer, F.: Ganimation: anatomically-aware facial animation from a single image. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 818\u2013833 (2018)","DOI":"10.1007\/978-3-030-01249-6_50"},{"key":"1982_CR24","doi-asserted-by":"crossref","unstructured":"Qiao, T., Zhang, J., Xu, D., Tao, D.: Mirrorgan: learning text-to-image generation by redescription. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1505\u20131514 (2019)","DOI":"10.1109\/CVPR.2019.00160"},{"key":"1982_CR25","doi-asserted-by":"crossref","unstructured":"Saito, M., Matsumoto, E., Saito, S.: Temporal generative adversarial nets with singular value clipping. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2830\u20132839 (2017)","DOI":"10.1109\/ICCV.2017.308"},{"key":"1982_CR26","doi-asserted-by":"crossref","unstructured":"Song, Y., Zhu, J., Wang, X., Qi, H.: Talking face generation by conditional recurrent adversarial network (2018). arXiv preprint arXiv:1804.04786","DOI":"10.24963\/ijcai.2019\/129"},{"key":"1982_CR27","doi-asserted-by":"crossref","unstructured":"Stafylakis, T., Tzimiropoulos, G.: Combining residual networks with LSTMS for lipreading (2017). arXiv preprint arXiv:1703.04105","DOI":"10.21437\/Interspeech.2017-85"},{"issue":"4","key":"1982_CR28","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. (TOG) 36(4), 95 (2017)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"4","key":"1982_CR29","doi-asserted-by":"publisher","first-page":"93","DOI":"10.1145\/3072959.3073699","volume":"36","author":"S Taylor","year":"2017","unstructured":"Taylor, S., Kim, T., Yue, Y., Mahler, M., Krahe, J., Rodriguez, A.G., Hodgins, J., Matthews, I.: A deep learning approach for generalized speech animation. ACM Trans. Graph. (TOG) 36(4), 93 (2017)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"1982_CR30","doi-asserted-by":"crossref","unstructured":"Thies, J., Zollhofer, M., Stamminger, M., Theobalt, C., Nie\u00dfner, M.: Face2face: real-time face capture and reenactment of RGB videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2387\u20132395 (2016)","DOI":"10.1109\/CVPR.2016.262"},{"key":"1982_CR31","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.Y., Yang, X., Kautz, J.: Mocogan: decomposing motion and content for video generation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1526\u20131535 (2018)","DOI":"10.1109\/CVPR.2018.00165"},{"key":"1982_CR32","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: Advances In Neural Information Processing Systems, pp. 613\u2013621 (2016)"},{"key":"1982_CR33","doi-asserted-by":"crossref","unstructured":"Vougioukas, K., Petridis, S., Pantic, M.: End-to-end speech-driven realistic facial animation with temporal GANs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 37\u201340 (2019)","DOI":"10.1007\/s11263-019-01251-8"},{"key":"1982_CR34","doi-asserted-by":"crossref","unstructured":"Wan, V., Anderson, R., Blokland, A., Braunschweiler, N., Chen, L., Kolluru, B., Latorre, J., Maia, R., Stenger, B., Yanagisawa, K., et\u00a0al.: Photo-realistic expressive text to talking head synthesis. In: INTERSPEECH, pp. 2667\u20132669 (2013)","DOI":"10.1145\/2503385.2503473"},{"issue":"4","key":"1982_CR35","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A.C., Sheikh, H.R., Simoncelli, E.P., et al.: Image quality assessment: from error visibility to structural similarity. IEEE Trans. Image Process. 13(4), 600\u2013612 (2004)","journal-title":"IEEE Trans. Image Process."},{"key":"1982_CR36","doi-asserted-by":"crossref","unstructured":"Wiles, O., Sophia\u00a0Koepke, A., Zisserman, A.: X2face: a network for controlling face generation using images, audio, and pose codes. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 670\u2013686 (2018)","DOI":"10.1007\/978-3-030-01261-8_41"},{"issue":"1","key":"1982_CR37","doi-asserted-by":"publisher","first-page":"79","DOI":"10.3354\/cr030079","volume":"30","author":"CJ Willmott","year":"2005","unstructured":"Willmott, C.J., Matsuura, K.: Advantages of the mean absolute error (MAE) over the root mean square error (RMSE) in assessing average model performance. Clim. Res. 30(1), 79\u201382 (2005)","journal-title":"Clim. Res."},{"issue":"3","key":"1982_CR38","doi-asserted-by":"publisher","first-page":"500","DOI":"10.1109\/TMM.2006.888009","volume":"9","author":"L Xie","year":"2007","unstructured":"Xie, L., Liu, Z.Q.: Realistic mouth-synching for speech-driven talking face using articulatory modelling. IEEE Trans. Multimed. 9(3), 500\u2013510 (2007)","journal-title":"IEEE Trans. Multimed."},{"key":"1982_CR39","doi-asserted-by":"crossref","unstructured":"Xu, T., Zhang, P., Huang, Q., Zhang, H., Gan, Z., Huang, X., He, X.: Attngan: fine-grained text to image generation with attentional generative adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1316\u20131324 (2018)","DOI":"10.1109\/CVPR.2018.00143"},{"issue":"1\u20132","key":"1982_CR40","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1016\/S0167-6393(98)00048-X","volume":"26","author":"H Yehia","year":"1998","unstructured":"Yehia, H., Rubin, P., Vatikiotis-Bateson, E.: Quantitative association of vocal-tract and facial behavior. Speech Commun. 26(1\u20132), 23\u201343 (1998)","journal-title":"Speech Commun."},{"key":"1982_CR41","unstructured":"Zhang, H., Goodfellow, I., Metaxas, D., Odena, A.: Self-attention generative adversarial networks (2018). arXiv preprint arXiv:1805.08318"},{"key":"1982_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, H., Xu, T., Li, H., Zhang, S., Wang, X., Huang, X., Metaxas, D.N.: Stackgan: text to photo-realistic image synthesis with stacked generative adversarial networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5907\u20135915 (2017)","DOI":"10.1109\/ICCV.2017.629"},{"issue":"8","key":"1982_CR43","doi-asserted-by":"publisher","first-page":"1947","DOI":"10.1109\/TPAMI.2018.2856256","volume":"41","author":"H Zhang","year":"2018","unstructured":"Zhang, H., Xu, T., Li, H., Zhang, S., Wang, X., Huang, X., Metaxas, D.N.: Stackgan++: realistic image synthesis with stacked generative adversarial networks. IEEE Trans. Pattern Anal. Mach. Intell. 41(8), 1947\u20131962 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1982_CR44","first-page":"9299","volume":"33","author":"H Zhou","year":"2019","unstructured":"Zhou, H., Liu, Y., Liu, Z., Luo, P., Wang, X.: Talking face generation by adversarially disentangled audio-visual representation. Proc. AAAI Conf. Artif. Intell. 33, 9299\u20139306 (2019)","journal-title":"Proc. AAAI Conf. Artif. Intell."}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-020-01982-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-020-01982-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-020-01982-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,7]],"date-time":"2023-10-07T23:47:39Z","timestamp":1696722459000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-020-01982-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,9,22]]},"references-count":44,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2021,1]]}},"alternative-id":["1982"],"URL":"https:\/\/doi.org\/10.1007\/s00371-020-01982-7","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"type":"print","value":"0178-2789"},{"type":"electronic","value":"1432-2315"}],"subject":[],"published":{"date-parts":[[2020,9,22]]},"assertion":[{"value":"12 September 2020","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 September 2020","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with ethical standards"}},{"value":"To the best of our knowledge, the named authors have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}