{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:46:51Z","timestamp":1778082411728,"version":"3.51.4"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031197895","type":"print"},{"value":"9783031197901","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19790-1_3","type":"book-chapter","created":{"date-parts":[[2022,10,23]],"date-time":"2022-10-23T07:02:44Z","timestamp":1666508564000},"page":"34-50","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Sound-Guided Semantic Video Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7773-7858","authenticated-orcid":false,"given":"Seung Hyun","family":"Lee","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4705-0705","authenticated-orcid":false,"given":"Gyeongrok","family":"Oh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4780-4749","authenticated-orcid":false,"given":"Wonmin","family":"Byeon","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6865-7154","authenticated-orcid":false,"given":"Chanyoung","family":"Kim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9304-136X","authenticated-orcid":false,"given":"Won Jeong","family":"Ryoo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3780-5350","authenticated-orcid":false,"given":"Sang Ho","family":"Yoon","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1080-9394","authenticated-orcid":false,"given":"Hyunjun","family":"Cho","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6378-9887","authenticated-orcid":false,"given":"Jihyun","family":"Bae","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6520-2074","authenticated-orcid":false,"given":"Jinkyu","family":"Kim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7349-0018","authenticated-orcid":false,"given":"Sangpil","family":"Kim","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,24]]},"reference":[{"key":"3_CR1","unstructured":"Brouwer, H.: Audio-reactive latent interpolations with StyleGAN. In: NeurIPS 2020 Workshop on Machine Learning for Creativity and Design (2020)"},{"key":"3_CR2","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"3_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"701","DOI":"10.1007\/978-3-030-58583-9_42","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Chatterjee","year":"2020","unstructured":"Chatterjee, M., Cherian, A.: Sound2Sight: generating visual dynamics from sound and context. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12372, pp. 701\u2013719. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58583-9_42"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: VGGSound: a large-scale audio-visual dataset. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 721\u2013725. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"3_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/978-3-030-58545-7_3","volume-title":"Computer Vision \u2013 ECCV 2020","author":"L Chen","year":"2020","unstructured":"Chen, L., et al.: Talking-head generation with rhythmic head motion. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12354, pp. 35\u201351. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58545-7_3"},{"key":"3_CR6","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7832\u20137841 (2019)","DOI":"10.1109\/CVPR.2019.00802"},{"key":"3_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"408","DOI":"10.1007\/978-3-030-58577-8_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"D Das","year":"2020","unstructured":"Das, D., Biswas, S., Sinha, S., Bhowmick, B.: Speech-driven facial animation using cascaded GANs for learning of motion and texture. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 408\u2013424. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_25"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: Arcface: additive angular margin loss for deep face recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4690\u20134699 (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"3_CR9","unstructured":"Fox, G., Tewari, A., Elgharib, M., Theobalt, C.: Stylevideogan: a temporal generative model using a pretrained stylegan. arXiv preprint arXiv:2107.07224 (2021)"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Guzhov, A., Raue, F., Hees, J., Dengel, A.: AudioCLIP: extending clip to image, text and audio (2021)","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.Y., Zhou, T., Efros, A.A.: Image-to-image translation with conditional adversarial networks. CVPR (2017)","DOI":"10.1109\/CVPR.2017.632"},{"key":"3_CR12","unstructured":"Jeong, D., Doh, S., Kwon, T.: Tr$$\\ddot{a}$$umerai: Dreaming music with stylegan. arXiv preprint arXiv:2102.04680 (2021)"},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"Ji, X., et al.: Audio-driven emotional video portraits. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14080\u201314089 (2021)","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"3_CR14","first-page":"852","volume":"34","author":"T Karras","year":"2021","unstructured":"Karras, T., et al.: Alias-free generative adversarial networks. Adv. Neural. Inf. Process. Syst. 34, 852\u2013863 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aila, T.: A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4401\u20134410 (2019)","DOI":"10.1109\/CVPR.2019.00453"},{"key":"3_CR16","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"3_CR17","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: Proceedings of the International Conference on Computer Vision (ICCV) (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Lahiri, A., Kwatra, V., Frueh, C., Lewis, J., Bregler, C.: LipSync3D: data-efficient learning of personalized 3D talking faces from video using pose and lighting normalization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2755\u20132764 (2021)","DOI":"10.1109\/CVPR46437.2021.00278"},{"key":"3_CR20","first-page":"14042","volume":"34","author":"G Le Moing","year":"2021","unstructured":"Le Moing, G., Ponce, J., Schmid, C.: Ccvs: context-aware controllable video synthesis. Adv. Neural. Inf. Process. Syst. 34, 14042\u201314055 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR21","unstructured":"Lee, S.H., et al.: Sound-guided semantic image manipulation. arXiv preprint arXiv:2112.00007 (2021)"},{"issue":"2","key":"3_CR22","doi-asserted-by":"publisher","first-page":"522","DOI":"10.1109\/TMM.2018.2856090","volume":"21","author":"B Li","year":"2018","unstructured":"Li, B., Liu, X., Dinesh, K., Duan, Z., Sharma, G.: Creating a multitrack classical music performance dataset for multimodal music analysis: challenges, insights, and applications. IEEE Trans. Multimedia 21(2), 522\u2013535 (2018)","journal-title":"IEEE Trans. Multimedia"},{"issue":"5","key":"3_CR23","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1109\/MSP.2021.3090678","volume":"38","author":"A Mesaros","year":"2021","unstructured":"Mesaros, A., Heittola, T., Virtanen, T., Plumbley, M.D.: Sound event detection: a tutorial. IEEE Signal Process. Mag. 38(5), 67\u201383 (2021). https:\/\/doi.org\/10.1109\/MSP.2021.3090678","journal-title":"IEEE Signal Process. Mag."},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Wu, Z., Shechtman, E., Cohen-Or, D., Lischinski, D.: StyleCLIP: text-driven manipulation of stylegan imagery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2085\u20132094 (2021)","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"3_CR25","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. 
In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Richard, A., Lea, C., Ma, S., Gall, J., de la Torre, F., Sheikh, Y.: Audio- and gaze-driven facial animation of codec avatars. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 41\u201350 (2021)","DOI":"10.1109\/WACV48630.2021.00009"},{"key":"3_CR27","doi-asserted-by":"crossref","unstructured":"Richardson, E., et al.: Encoding in style: a StyleGAN encoder for image-to-image translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2287\u20132296 (2021)","DOI":"10.1109\/CVPR46437.2021.00232"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Saito, M., Matsumoto, E., Saito, S.: Temporal generative adversarial nets with singular value clipping. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2830\u20132839 (2017)","DOI":"10.1109\/ICCV.2017.308"},{"key":"3_CR29","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. Adv. Neural Inf. Process. Syst. 29 (2016)"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Skorokhodov, I., Sotnikov, G., Elhoseiny, M.: Aligning latent and image spaces to connect the unconnectable. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 14144\u201314153 (2021)","DOI":"10.1109\/ICCV48922.2021.01388"},{"key":"3_CR31","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing Obama. ACM Trans. Graph. (TOG) 36, 1\u201313 (2017)","DOI":"10.1145\/3072959.3073640"},{"key":"3_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"716","DOI":"10.1007\/978-3-030-58517-4_42","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Thies","year":"2020","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., Nie\u00dfner, M.: Neural voice puppetry: audio-driven facial reenactment. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12361, pp. 716\u2013731. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58517-4_42"},{"key":"3_CR34","unstructured":"Tian, Y., et al.: A good image generator is what you need for high-resolution video synthesis. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=6puCSjH3hwA"},{"key":"3_CR35","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.Y., Yang, X., Kautz, J.: MoCoGAN: decomposing motion and content for video generation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1526\u20131535 (2018)","DOI":"10.1109\/CVPR.2018.00165"},{"key":"3_CR36","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717 (2018)"},{"key":"3_CR37","first-page":"613","volume":"29","author":"C Vondrick","year":"2016","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. Adv. Neural. Inf. Process. Syst. 29, 613\u2013621 (2016)","journal-title":"Adv. Neural. Inf. Process. 
Syst."},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Wang, T.C., Mallya, A., Liu, M.Y.: One-shot free-view neural talking-head synthesis for video conferencing. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"3_CR39","doi-asserted-by":"crossref","unstructured":"Wang, Y., Bilinski, P., Bremond, F., Dantcheva, A.: G3AN: disentangling appearance and motion for video generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5264\u20135273 (2020)","DOI":"10.1109\/CVPR42600.2020.00531"},{"key":"3_CR40","doi-asserted-by":"crossref","unstructured":"Wu, H., Jia, J., Wang, H., Dou, Y., Duan, C., Deng, Q.: Imitating arbitrary talking style for realistic audio-driven talking face synthesis. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1478\u20131486 (2021)","DOI":"10.1145\/3474085.3475280"},{"key":"3_CR41","doi-asserted-by":"crossref","unstructured":"Wu, H.H., Seetharaman, P., Kumar, K., Bello, J.P.: Wav2clip: Learning robust audio representations from clip (2021)","DOI":"10.31219\/osf.io\/r2vwf"},{"key":"3_CR42","doi-asserted-by":"crossref","unstructured":"Xia, W., Yang, Y., Xue, J.H., Wu, B.: TediGAN: text-guided diverse face image generation and manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2256\u20132265 (2021)","DOI":"10.1109\/CVPR46437.2021.00229"},{"key":"3_CR43","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: video generation using VQ-VAE and transformers. arXiv preprint arXiv:2104.10157 (2021)"},{"key":"3_CR44","unstructured":"Yi, R., Ye, Z., Zhang, J., Bao, H., Liu, Y.J.: Audio-driven talking face video generation with learning-based personalized head pose (2020)"},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"issue":"6","key":"3_CR46","first-page":"1","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: MakeitTALK: speaker-aware talking-head animation. ACM Trans. Graph. 39(6), 1\u20135 (2020)","journal-title":"ACM Trans. 
Graph."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19790-1_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T10:57:58Z","timestamp":1710241078000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19790-1_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031197895","9783031197901"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19790-1_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"24 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference 
organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}