{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T13:57:00Z","timestamp":1775570220052,"version":"3.50.1"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729393","type":"print"},{"value":"9783031729409","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T00:00:00Z","timestamp":1731801600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T00:00:00Z","timestamp":1731801600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72940-9_1","type":"book-chapter","created":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T20:42:53Z","timestamp":1731789773000},"page":"1-18","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":15,"title":["Audio-Synchronized Visual Animation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5219-2630","authenticated-orcid":false,"given":"Lin","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3308-9585","authenticated-orcid":false,"given":"Shentong","family":"Mo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2848-8649","authenticated-orcid":false,"given":"Yijing","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0955-6510","authenticated-orcid":false,"given":"Pedro","family":"Morgado","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,17]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: IEEE International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"1_CR2","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"1_CR3","unstructured":"Brock, A., Donahue, J., Simonyan, K.: Large scale GAN training for high fidelity natural image synthesis. In: ICLR (2019)"},{"key":"1_CR4","unstructured":"Castellano, B.: Pyscenedetect. https:\/\/www.scenedetect.com\/"},{"key":"1_CR5","unstructured":"Chen, H., et al.: VideoCrafter1: open diffusion models for high-quality video generation (2023)"},{"key":"1_CR6","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Audio-visual synchronization in the wild. In: Proceedings of the British Machine Vision Conference (BMVC) (2021)"},{"key":"1_CR7","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: VGGSound: a large-scale audio-visual dataset. In: ICASSP (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"1_CR8","unstructured":"Chen, S., et al.: BEATs: audio pre-training with acoustic tokenizers. In: ICML (2023)"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Zisserman, A.: Lip reading in the wild. In: ACCV (2016)","DOI":"10.1007\/978-3-319-54184-6_6"},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: ACCV Workshop (2016)","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Chung, S.W., Chung, J.S., Kang, H.G.: Perfect match: improved cross-modal embeddings for audio-visual synchronisation. In: ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2019)","DOI":"10.1109\/ICASSP.2019.8682524"},{"key":"1_CR12","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: NeurIPS (2021)"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: Proceedings of IEEE ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: ImageBind: one embedding space to bind them all. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"1_CR15","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. In: ICLR (2023)"},{"key":"1_CR16","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: Advances in Neural Information Processing Systems (2017)"},{"key":"1_CR17","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS (2020)"},{"key":"1_CR18","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. In: NeurIPS Workshop on Deep Generative Models and Downstream Applications (2022)"},{"key":"1_CR19","unstructured":"Iashin, V., Xie, W., Rahtu, E., Zisserman, A.: Sparse in space and time: audio-visual synchronisation with trainable selectors. In: British Machine Vision Conference (BMVC) (2022)"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Jeong, Y., et al.: The power of sound (TPOS): audio reactive video generation with stable diffusion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7822\u20137832 (2023)","DOI":"10.1109\/ICCV51070.2023.00719"},{"key":"1_CR21","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2Video-Zero: text-to-image diffusion models are zero-shot video generators. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"1_CR22","doi-asserted-by":"publisher","unstructured":"Lee, S.H., et al.: Sound-guided semantic video generation. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13677. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_3","DOI":"10.1007\/978-3-031-19790-1_3"},{"key":"1_CR23","unstructured":"Lee, S., Kong, C., Jeon, D., Kwak, N.: AADiff: audio-aligned video synthesis with text-to-image diffusion. In: CVPR Workshop on Content Generation (2023)"},{"key":"1_CR24","unstructured":"Li, R., Yang, S., Ross, D.A., Kanazawa, A.: Learn to dance with AIST++: music conditioned 3D dance generation. In: ICCV (2021)"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GLIGEN: open-set grounded text-to-image generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"1_CR26","unstructured":"Luo, S., Yan, C., Hu, C., Zhao, H.: Diff-Foley: synchronized video-to-audio synthesis with latent diffusion models. In: NeurIPS (2023)"},{"key":"1_CR27","doi-asserted-by":"crossref","unstructured":"Mokady, R., Hertz, A., Aberman, K., Pritch, Y., Cohen-Or, D.: Null-text inversion for editing real images using guided diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"1_CR28","doi-asserted-by":"crossref","unstructured":"Ng, E., et al.: From audio to photoreal embodiment: synthesizing humans in conversations. In: arXiv (2024)","DOI":"10.1109\/CVPR52733.2024.00101"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"1_CR30","doi-asserted-by":"crossref","unstructured":"Park, S.J., Kim, M., Hong, J., Choi, J., Ro, Y.M.: SyncTalkFace: talking face generation with precise lip-syncing via audio-lip memory. In: AAAI Conference on Artificial Intelligence (AAAI) (2022)","DOI":"10.1609\/aaai.v36i2.20102"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Morgado, P., Misra, I., Vasconcelos, N.: Robust audio-visual instance discrimination. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"1_CR32","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"1_CR33","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. In: arXiv (2022)"},{"key":"1_CR34","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"1_CR35","doi-asserted-by":"crossref","unstructured":"Ruan, L., et al.: MM-Diffusion: learning multi-modal diffusion models for joint audio and video generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"1_CR36","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: NeurIPS (2022)"},{"key":"1_CR37","unstructured":"Singer, U., et al.: Make-a-video: text-to-video generation without text-video data (2022)"},{"key":"1_CR38","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: ICLR (2021)"},{"key":"1_CR39","doi-asserted-by":"crossref","unstructured":"Sung-Bin, K., Senocak, A., Ha, H., Owens, A., Oh, T.H.: Sound to visual scene generation by audio-to-visual latent alignment. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00622"},{"key":"1_CR40","unstructured":"Tang, Z., Yang, Z., Zhu, C., Zeng, M., Bansal, M.: Any-to-any generation via composable diffusion. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=2EDqbSCnmF"},{"key":"1_CR41","unstructured":"Tsuchida, S., Fukayama, S., Hamasaki, M., Goto, M.: AIST dance video database: multi-genre, multi-dancer, and multi-camera database for dance information processing. In: Proceedings of the 20th International Society for Music Information Retrieval Conference, ISMIR 2019, Delft, Netherlands, November 2019"},{"key":"1_CR42","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: Towards accurate generative models of video: a new metric & challenges. In: arXiv (2019)"},{"key":"1_CR43","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"1_CR44","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X., Zhang, S.: ModelScope text-to-video technical report (2023)"},{"key":"1_CR45","unstructured":"Wang, W., et al.: Zero-shot video editing using off-the-shelf image diffusion models. arXiv preprint arXiv:2303.17599 (2023)"},{"key":"1_CR46","doi-asserted-by":"crossref","unstructured":"Wu, R., Chen, L., Yang, T., Guo, C., Li, C., Zhang, X.: LAMP: learn a motion pattern by few-shot tuning a text-to-image diffusion model. arXiv preprint arXiv:2310.10769 (2023)","DOI":"10.1109\/CVPR52733.2024.00677"},{"key":"1_CR47","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: VideoClip: contrastive pre-training for zero-shot video-text understanding. In: EMNLP (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"1_CR48","doi-asserted-by":"crossref","unstructured":"Yariv, G., Gat, I., Benaim, S., Wolf, L., Schwartz, I., Adi, Y.: Diverse and aligned audio-to-video generation via text-to-video model adaptation (2023)","DOI":"10.1609\/aaai.v38i7.28486"},{"key":"1_CR49","unstructured":"Ye, Z., Jiang, Z., Ren, Y., Liu, J., He, J., Zhao, Z.: GeneFace: Generalized and high-fidelity audio-driven 3D talking face synthesis. In: ICLR (2023)"},{"key":"1_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"1_CR51","doi-asserted-by":"crossref","unstructured":"Zhou, H., Liu, Y., Liu, Z., Luo, P., Wang, X.: Talking face generation by adversarially disentangled audio-visual representation. In: AAAI Conference on Artificial Intelligence (AAAI) (2019)","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"1_CR52","doi-asserted-by":"crossref","unstructured":"Zhou, H., Sun, Y., Wu, W., Loy, C.C., Wang, X., Liu, Z.: Pose-controllable talking face generation by implicitly modularized audio-visual representation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00416"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72940-9_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T21:32:28Z","timestamp":1731792748000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72940-9_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,17]]},"ISBN":["9783031729393","9783031729409"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72940-9_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,17]]},"assertion":[{"value":"17 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}