{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:50:05Z","timestamp":1778082605870,"version":"3.51.4"},"publisher-location":"Cham","reference-count":29,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319544267","type":"print"},{"value":"9783319544274","type":"electronic"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-54427-4_19","type":"book-chapter","created":{"date-parts":[[2017,3,15]],"date-time":"2017-03-15T08:16:53Z","timestamp":1489565813000},"page":"251-263","source":"Crossref","is-referenced-by-count":367,"title":["Out of Time: Automated Lip Sync in the Wild"],"prefix":"10.1007","author":[{"given":"Joon Son","family":"Chung","sequence":"first","affiliation":[]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,3,16]]},"reference":[{"key":"19_CR1","unstructured":"Bt.1359: Relative timing of sound and vision for broadcasting. ITU (1998)"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Anina, I., Zhou, Z., Zhao, G., Pietik\u00e4inen, M.: Ouluvs2: a multi-view audiovisual database for non-rigid mouth motion analysis. In: 11th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition (FG), vol. 1, pp. 1\u20135. IEEE (2015)","DOI":"10.1109\/FG.2015.7163155"},{"issue":"1","key":"19_CR3","doi-asserted-by":"crossref","first-page":"179","DOI":"10.1155\/2007\/70186","volume":"2007","author":"H Bredin","year":"2007","unstructured":"Bredin, H., Chollet, G.: Audiovisual speech synchrony measure: application to biometrics. EURASIP J. Appl. Signal Process. 2007(1), 179 (2007)","journal-title":"EURASIP J. Appl. Signal Process."},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Chakravarty, P., Tuytelaars, T.: Cross-modal supervision for learning active speaker detection in video. arXiv preprint arXiv:1603.08907 (2016)","DOI":"10.1007\/978-3-319-46454-1_18"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Chatfield, K., Simonyan, K., Vedaldi, A., Zisserman, A.: Return of the devil in the details: delving deep into convolutional nets. In: Proceedings of BMVC (2014)","DOI":"10.5244\/C.28.6"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Chopra, S., Hadsell, R., LeCun, Y.: Learning a similarity metric discriminatively, with application to face verification. In: Proceedings of the CVPR, vol. 1, pp. 539\u2013546. IEEE (2005)","DOI":"10.1109\/CVPR.2005.202"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Zisserman, A.: Lip reading in the wild. In: Proceedings of ACCV (2016)","DOI":"10.1007\/978-3-319-54184-6_6"},{"issue":"4","key":"19_CR8","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","volume":"28","author":"SB Davis","year":"1980","unstructured":"Davis, S.B., Mermelstein, P.: Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE Trans. Acoust. Speech Signal Process. 28(4), 357\u2013366 (1980)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"19_CR9","unstructured":"Geras, K.J., Mohamed, A.R., Caruana, R., Urban, G., Wang, S., Aslan, O., Philipose, M., Richardson, M., Sutton, C.: Compressing LSTMS into CNNS. arXiv preprint arXiv:1511.06433 (2015)"},{"key":"19_CR10","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 (2015)"},{"key":"19_CR11","unstructured":"Jia, Y.: Caffe: an open source convolutional architecture for fast feature embedding (2013). http:\/\/caffe.berkeleyvision.org\/"},{"key":"19_CR12","first-page":"1755","volume":"10","author":"DE King","year":"2009","unstructured":"King, D.E.: Dlib-ml: a machine learning toolkit. J. Mach. Learn. Res. 10, 1755\u20131758 (2009)","journal-title":"J. Mach. Learn. Res."},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Koster, B.E., Rodman, R.D., Bitzer, D.: Automated lip-sync: direct translation of speech-sound to mouth-shape. In: 1994 Conference Record of the Twenty-Eighth Asilomar Conference on Signals, Systems and Computers, vol. 1, pp. 583\u2013586. IEEE (1994)","DOI":"10.1109\/ACSSC.1994.471519"},{"key":"19_CR14","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: NIPS, pp. 1106\u20131114 (2012)"},{"issue":"4","key":"19_CR15","doi-asserted-by":"crossref","first-page":"118","DOI":"10.1002\/vis.4340020404","volume":"2","author":"J Lewis","year":"1991","unstructured":"Lewis, J.: Automated lip-sync: background and techniques. J. Vis. Comput. Anim. 2(4), 118\u2013122 (1991)","journal-title":"J. Vis. Comput. Anim."},{"key":"19_CR16","doi-asserted-by":"crossref","first-page":"469","DOI":"10.1142\/S021946780100027X","volume":"1","author":"R Lienhart","year":"2001","unstructured":"Lienhart, R.: Reliable transition detection in videos: a survey and practitioner\u2019s guide. Int. J. Image Graph. 1, 469\u2013486 (2001)","journal-title":"Int. J. Image Graph."},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Marcheret, E., Potamianos, G., Vopicka, J., Goel, V.: Detecting audio-visual synchrony using deep neural networks. In: Sixteenth Annual Conference of the International Speech Communication Association (2015)","DOI":"10.21437\/Interspeech.2015-201"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"McAllister, D.F., Rodman, R.D., Bitzer, D.L., Freeman, A.S.: Lip synchronization of speech. In: Audio-Visual Speech Processing: Computational & Cognitive Science Approaches (1997)","DOI":"10.1145\/259081.259312"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Morishima, S., Ogata, S., Murai, K., Nakamura, S.: Audio-visual speech translation with automatic lip syncronization and face tracking based on 3-D head model. In: IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), vol. 2, p. II-2117. IEEE (2002)","DOI":"10.1109\/ICASSP.2002.5745053"},{"issue":"3","key":"19_CR20","doi-asserted-by":"crossref","first-page":"271","DOI":"10.1007\/s10044-008-0121-2","volume":"12","author":"EA R\u00faa","year":"2009","unstructured":"R\u00faa, E.A., Bredin, H., Mateo, C.G., Chollet, G., Jim\u00e9nez, D.G.: Audio-visual speech asynchrony detection using co-inertia analysis and coupled hidden markov models. Pattern Anal. Appl. 12(3), 271\u2013284 (2009)","journal-title":"Pattern Anal. Appl."},{"key":"19_CR21","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, S., Karpathy, A., Khosla, A., Bernstein, M., Berg, A., Li, F.: ImageNet large scale visual recognition challenge. IJCV 115, 211\u2013252 (2015)","journal-title":"IJCV"},{"issue":"7","key":"19_CR22","doi-asserted-by":"crossref","first-page":"1396","DOI":"10.1109\/TMM.2007.906583","volume":"9","author":"ME Sargin","year":"2007","unstructured":"Sargin, M.E., Yemez, Y., Erzin, E., Tekalp, A.M.: Audiovisual synchronization and fusion using canonical correlation analysis. IEEE Trans. Multimed. 9(7), 1396\u20131403 (2007)","journal-title":"IEEE Trans. Multimed."},{"key":"19_CR23","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NIPS (2014)"},{"key":"19_CR24","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: International Conference on Learning Representations (2015)"},{"key":"19_CR25","unstructured":"Lucas, B.D., Kanade, T.: An iterative image registration technique with an application to stereo vision, Vancouver, BC, Canada (1981)"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Vedaldi, A., Lenc, K.: Matconvnet - convolutional neural networks for MATLAB. CoRR abs\/1412.4564 (2014)","DOI":"10.1145\/2733373.2807412"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Arandjelovi\u0107, R., Zisserman, A.: Faces in places: compound query retrieval. In: British Machine Vision Conference (2016)","DOI":"10.5244\/C.30.56"},{"issue":"1","key":"19_CR28","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TPAMI.2013.173","volume":"36","author":"Z Zhou","year":"2014","unstructured":"Zhou, Z., Hong, X., Zhao, G., Pietik\u00e4inen, M.: A compact representation of visual speech data using latent variables. IEEE Trans. Pattern Anal. Mach. Intell. 36(1), 1 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR29","doi-asserted-by":"crossref","unstructured":"Zoric, G., Pandzic, I.S.: A real-time lip sync system using a genetic algorithm for automatic neural network configuration. In: 2005 IEEE International Conference on Multimedia and Expo, pp. 1366\u20131369. IEEE (2005)","DOI":"10.1109\/ICME.2005.1521684"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2016 Workshops"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-54427-4_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,16]],"date-time":"2025-06-16T21:16:39Z","timestamp":1750108599000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-54427-4_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319544267","9783319544274"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017]]}}}