{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,29]],"date-time":"2026-06-29T12:49:37Z","timestamp":1782737377239,"version":"3.54.5"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"11-12","license":[{"start":{"date-parts":[[2019,2,13]],"date-time":"2019-02-13T00:00:00Z","timestamp":1550016000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/501100000266","name":"Engineering and Physical Sciences Research Council","doi-asserted-by":"publisher","award":["EP\/M013774\/1"],"award-info":[{"award-number":["EP\/M013774\/1"]}],"id":[{"id":"10.13039\/501100000266","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2019,12]]},"DOI":"10.1007\/s11263-019-01150-y","type":"journal-article","created":{"date-parts":[[2019,2,13]],"date-time":"2019-02-13T11:31:10Z","timestamp":1550057470000},"page":"1767-1779","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":167,"title":["You Said That?: Synthesising Talking Faces from Audio"],"prefix":"10.1007","volume":"127","author":[{"given":"Amir","family":"Jamaludin","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2019,2,13]]},"reference":[{"key":"1150_CR1","unstructured":"Afouras, T., Chung, J. S., Senior, A., Vinyals, O., & Zisserman, A. (2018). Deep audio-visual speech recognition. In IEEE transactions on pattern analysis and machine intelligence. arXiv preprint arXiv:1809.02108 ."},{"key":"1150_CR2","doi-asserted-by":"crossref","unstructured":"Arandjelovi\u0107, R., & Zisserman, A. (2017). Look, listen and learn. In Proceedings of the international conference on computer vision.","DOI":"10.1109\/ICCV.2017.73"},{"key":"1150_CR3","unstructured":"Aytar, Y., Vondrick, C., & Torralba, A. (2016). SoundNet: Learning sound representations from unlabeled video. In Advances in neural information processing systems."},{"key":"1150_CR4","unstructured":"Cappelletta, L., & Harte, N. (2012). Phoneme-to-viseme mapping for visual speech recognition. In ICPRAM."},{"key":"1150_CR5","doi-asserted-by":"crossref","unstructured":"Charles, J., Magee, D., & Hogg, D. (2016). Virtual immortality: Reanimating characters from TV shows. In Computer vision\u2013ECCV 2016 workshops (pp. 879\u2013886). Springer.","DOI":"10.1007\/978-3-319-49409-8_71"},{"key":"1150_CR6","doi-asserted-by":"crossref","unstructured":"Chatfield, K., Simonyan, K., Vedaldi, A., & Zisserman, A. (2014). Return of the devil in the details: Delving deep into convolutional nets. In Proceedings of the british machine vision conference.","DOI":"10.5244\/C.28.6"},{"key":"1150_CR7","doi-asserted-by":"crossref","unstructured":"Chen, Q., & Koltun, V. (2017). Photographic image synthesis with cascaded refinement networks. In Proceedings of the international conference on computer vision.","DOI":"10.1109\/ICCV.2017.168"},{"key":"1150_CR8","unstructured":"Chung, J. S., & Zisserman, A. (2016). Out of time: automated lip sync in the wild. In Workshop on multi-view lip-reading, ACCV."},{"key":"1150_CR9","unstructured":"Chung, J. S., Jamaludin, A., & Zisserman, A. (2017). You said that? In Proceedings of the british machine vision conference."},{"key":"1150_CR10","doi-asserted-by":"crossref","unstructured":"Chung, J. S., Nagrani, A., & Zisserman, A. (2018). VoxCeleb2: Deep speaker recognition. In INTERSPEECH.","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"1150_CR11","doi-asserted-by":"crossref","unstructured":"Chung, J. S., Senior, A., Vinyals, O., & Zisserman, A. (2017). Lip reading sentences in the wild. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.367"},{"key":"1150_CR12","unstructured":"Chung, S. W., Chung, J. S., & Kang, H. G. (2019). Perfect match: Improved cross-modal embeddings for audio-visual synchronisa-tion. In IEEE international conference on acoustics, speech and signal processing. arXiv preprint arXiv:1809.08001 ."},{"key":"1150_CR13","unstructured":"Denton, E. L., & Birodkar, V. (2017). Unsupervised learning of disentangled representations from video. In Advances in neural information processing systems."},{"key":"1150_CR14","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., & Efros, A. A. (2015). Unsupervised visual representation learning by context prediction. In Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/ICCV.2015.167"},{"issue":"1","key":"1150_CR15","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1023\/A:1008166717597","volume":"38","author":"T Ezzat","year":"2000","unstructured":"Ezzat, T., & Poggio, T. (2000). Visual speech synthesis by morphing visemes. International Journal of Computer Vision, 38(1), 45\u201357.","journal-title":"International Journal of Computer Vision"},{"key":"1150_CR16","doi-asserted-by":"crossref","unstructured":"Fan, B., Wang, L., Soong, F. K., & Xie, L. (2015). Photo-real talking head with deep bidirectional LSTM. In IEEE international conference on acoustics, speech and signal processing.","DOI":"10.1109\/ICASSP.2015.7178899"},{"key":"1150_CR17","doi-asserted-by":"crossref","unstructured":"Fernando, B., Bilen, H., Gavves, E., & Gould, S. (2017). Self-supervised video representation learning with odd-one-out networks. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.607"},{"key":"1150_CR18","first-page":"193","volume-title":"Computer graphics forum","author":"P Garrido","year":"2015","unstructured":"Garrido, P., Valgaerts, L., Sarmadi, H., Steiner, I., Varanasi, K., P\u00e9rez, P., et al. (2015). VDUB: Modifying face video of actors for plausible visual alignment to a dubbed audio track. In O. Deussen & H. Zhang (Eds.), Computer graphics forum (Vol. 34, pp. 193\u2013204). London: Wiley."},{"key":"1150_CR19","doi-asserted-by":"crossref","unstructured":"Gatys, L. A., Ecker, A. S., & Bethge, M. (2016). Image style transfer using convolutional neural networks. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2414\u20132423).","DOI":"10.1109\/CVPR.2016.265"},{"key":"1150_CR20","unstructured":"Goodfellow, I. J., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A. C., & Bengio, Y. (2014). Generative adversarial nets. In Advances in neural information processing systems (pp. 2672\u20132680)."},{"issue":"5786","key":"1150_CR21","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1126\/science.1127647","volume":"313","author":"GE Hinton","year":"2006","unstructured":"Hinton, G. E., & Salakhutdinov, R. R. (2006). Reducing the dimensionality of data with neural networks. Science, 313(5786), 504\u2013507.","journal-title":"Science"},{"key":"1150_CR22","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J. Y., Zhou, T., & Efros, A. A. (2017). Image-to-image translation with conditional adversarial networks. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.632"},{"key":"1150_CR23","unstructured":"Isola, P., Zoran, D., Krishnan, D., & Adelson, E. H. (2016). Learning visual groups from co-occurrences in space and time. In Workshop at international conference on learning representations."},{"key":"1150_CR24","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3128\u20133137).","DOI":"10.1109\/CVPR.2015.7298932"},{"issue":"4","key":"1150_CR25","doi-asserted-by":"publisher","first-page":"94:1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., & Lehtinen, J. (2017). Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Transactions on Graphics, 36(4), 94:1\u201394:12. https:\/\/doi.org\/10.1145\/3072959.3073658 .","journal-title":"ACM Transactions on Graphics"},{"key":"1150_CR26","doi-asserted-by":"crossref","unstructured":"Kazemi, V., & Sullivan, J. (2014). One millisecond face alignment with an ensemble of regression trees. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1867\u20131874).","DOI":"10.1109\/CVPR.2014.241"},{"key":"1150_CR27","doi-asserted-by":"crossref","unstructured":"Kim, J., Lee, J. K., & Lee, K. M. (2016). Accurate image super-resolution using very deep convolutional networks. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2016.182"},{"key":"1150_CR28","first-page":"1755","volume":"10","author":"DE King","year":"2009","unstructured":"King, D. E. (2009). Dlib-ml: A machine learning toolkit. The Journal of Machine Learning Research, 10, 1755\u20131758.","journal-title":"The Journal of Machine Learning Research"},{"key":"1150_CR29","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1142\/S021946780100027X","volume":"1","author":"R Lienhart","year":"2001","unstructured":"Lienhart, R. (2001). Reliable transition detection in videos: A survey and practitioner\u2019s guide. International Journal of Image and Graphics, 1, 469.","journal-title":"International Journal of Image and Graphics"},{"key":"1150_CR30","unstructured":"Lucas, B. D., & Kanade, T. (1981). An iterative image registration technique with an application to stereo vision. In Proceedings of the 7th international joint conference on artificial intelligence (pp. 674\u2013679). http:\/\/citeseer.nj.nec.com\/lucas81optical.html ."},{"key":"1150_CR31","doi-asserted-by":"crossref","unstructured":"Misra, I., Zitnick, C. L., & Hebert, M. (2016). Shuffle and learn: unsupervised learning using temporal order verification. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-319-46448-0_32"},{"key":"1150_CR32","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., & Zisserman, A. (2018). Seeing voices and hearing faces: Cross-modal biometric matching. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2018.00879"},{"key":"1150_CR33","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J. H., Torralba, A., Adelson, E. H., & Freeman, W. T. (2016). Visually indicated sounds. In Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 2405\u20132413. IEEE Computer Society.","DOI":"10.1109\/CVPR.2016.264"},{"key":"1150_CR34","unstructured":"Parkhi, O. M. (2015). Features and methods for improving large scale face recognition. Ph.D. thesis, Department of Engineering Science Oxford University."},{"key":"1150_CR35","doi-asserted-by":"crossref","unstructured":"Parkhi, O. M., Vedaldi, A., & Zisserman, A. (2015). Deep face recognition. In Proceedings of the British machine vision conference","DOI":"10.5244\/C.29.41"},{"key":"1150_CR36","doi-asserted-by":"crossref","unstructured":"Pathak, D., Krahenbuhl, P., Donahue, J., Darrell, T., & Efros, A. A. (2016). Context encoders: Feature learning by inpainting. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2536\u20132544).","DOI":"10.1109\/CVPR.2016.278"},{"key":"1150_CR37","unstructured":"P\u0103tr\u0103ucean, V., Handa, A., & Cipolla, R. (2016). Spatio-temporal video autoencoder with differentiable memory. In Advances in neural information processing systems."},{"issue":"3","key":"1150_CR38","doi-asserted-by":"publisher","first-page":"313","DOI":"10.1145\/882262.882269","volume":"22","author":"P Perez","year":"2003","unstructured":"Perez, P., Gangnet, M., & Blake, A. (2003). Poisson image editing. ACM Transactions on Graphics, 22(3), 313\u2013318.","journal-title":"ACM Transactions on Graphics"},{"key":"1150_CR39","first-page":"1060","volume-title":"ICML. JMLR Workshop and Conference Proceedings","author":"SE Reed","year":"2016","unstructured":"Reed, S. E., Akata, Z., Yan, X., Logeswaran, L., Schiele, B., & Lee, H. (2016). Generative adversarial text to image synthesis. In M. E. Balcan & K. Q. Weinberger (Eds.), ICML. JMLR Workshop and Conference Proceedings (Vol. 48, pp. 1060\u20131069). Cambridge: JMLR."},{"key":"1150_CR40","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., & Brox, T. (2015). U-net: Convolutional networks for biomedical image segmentation. In International conference on medical image computing and computer-assisted intervention (pp. 234\u2013241). Springer.","DOI":"10.1007\/978-3-319-24574-4_28"},{"issue":"4","key":"1150_CR41","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S. M., & Kemelmacher-Shlizerman, I. (2017). Synthesizing obama: Learning lip sync from audio. ACM Transactions on Graphics (TOG), 36(4), 95.","journal-title":"ACM Transactions on Graphics (TOG)"},{"issue":"4","key":"1150_CR42","doi-asserted-by":"publisher","first-page":"93","DOI":"10.1145\/3072959.3073699","volume":"36","author":"S Taylor","year":"2017","unstructured":"Taylor, S., Kim, T., Yue, Y., Mahler, M., Krahe, J., Rodriguez, A. G., et al. (2017). A deep learning approach for generalized speech animation. ACM Transactions on Graphics (TOG), 36(4), 93.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"1150_CR43","unstructured":"van\u00a0den Oord, A., Kalchbrenner, N., Espeholt, L., Vinyals, O., Graves, A., et\u00a0al. (2016a). Conditional image generation with pixelcnn decoders. In Advances in neural information processing systems (pp. 4790\u20134798)."},{"key":"1150_CR44","unstructured":"van den Oord, A., Kalchbrenner, N., & Kavukcuoglu, K. (2016b). Pixel recurrent neural networks. In M. E. Balcan & K. Q. Weinberger (Eds.), Proceedings of the 33rd International Conference on Machine Learning. Proceedings of Machine Learning Research (Vol. 48, pp. 1747\u20131756). NewYork: PMLR."},{"key":"1150_CR45","doi-asserted-by":"crossref","unstructured":"Vedaldi, A., & Lenc, K. (2015). Matconvnet: Convolutional neural networks for matlab. In Proceedings of the ACM multimedia conference.","DOI":"10.1145\/2733373.2807412"},{"key":"1150_CR46","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., & Erhan, D. (2015). Show and tell: A neural image caption generator. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3156\u20133164).","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1150_CR47","doi-asserted-by":"crossref","unstructured":"Wang, X., & Gupta, A. (2015). Unsupervised learning of visual representations using videos. In Proceedings of the international conference on computer vision.","DOI":"10.1109\/ICCV.2015.320"},{"key":"1150_CR48","first-page":"2048","volume-title":"Proceedings of the 32nd International Conference on Machine Learning. Proceedings of Machine Learning Research","author":"K Xu","year":"2015","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., et al. (2015). Show, attend and tell: Neural image caption generation with visual attention. In F. Bach & D. Blei (Eds.), Proceedings of the 32nd International Conference on Machine Learning. Proceedings of Machine Learning Research (Vol. 37, pp. 2048\u20132057). Lille: PMLR."},{"key":"1150_CR49","unstructured":"Xue, T., Wu, J., Bouman, K., & Freeman, B. (2016). Visual dynamics: Probabilistic future frame synthesis via cross convolutional networks. In Advances in neural information processing systems."},{"key":"1150_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., & Efros, A. A. (2016). Colorful image colorization. In Proceedings of the European conference on computer vision (pp. 649\u2013666). Springer.","DOI":"10.1007\/978-3-319-46487-9_40"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-019-01150-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01150-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01150-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,11,27]],"date-time":"2020-11-27T16:57:26Z","timestamp":1606496246000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-019-01150-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,2,13]]},"references-count":50,"journal-issue":{"issue":"11-12","published-print":{"date-parts":[[2019,12]]}},"alternative-id":["1150"],"URL":"https:\/\/doi.org\/10.1007\/s11263-019-01150-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,2,13]]},"assertion":[{"value":"28 February 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 January 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 February 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}