{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T01:35:32Z","timestamp":1743039332447,"version":"3.40.3"},"publisher-location":"Cham","reference-count":59,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031730238"},{"type":"electronic","value":"9783031730245"}],"license":[{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73024-5_11","type":"book-chapter","created":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T16:41:49Z","timestamp":1732552909000},"page":"172-189","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Co-speech Gesture Video Generation with\u00a03D Human Meshes"],"prefix":"10.1007","author":[{"given":"Aniruddha","family":"Mahapatra","sequence":"first","affiliation":[]},{"given":"Richa","family":"Mishra","sequence":"additional","affiliation":[]},{"given":"Renda","family":"Li","sequence":"additional","affiliation":[]},{"given":"Ziyi","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Boyang","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Shoulei","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jun-Yan","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Chang","sequence":"additional","affiliation":[]},{"given":"Mei","family":"Han","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Xiao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,24]]},"reference":[{"key":"11_CR1","doi-asserted-by":"publisher","DOI":"10.1145\/3592097","author":"T Ao","year":"2023","unstructured":"Ao, T., Zhang, Z., Liu, L.: GestureDiffuCLIP: gesture diffusion model with CLIP latents. ACM Trans. Graph. (2023). https:\/\/doi.org\/10.1145\/3592097","journal-title":"ACM Trans. Graph."},{"key":"11_CR2","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: NeurIPS, vol. 33 (2020)"},{"key":"11_CR3","doi-asserted-by":"crossref","unstructured":"Boukhayma, A., Bem, R.D., Torr, P.H.: 3D hand shape and pose from images in the wild. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01110"},{"key":"11_CR4","doi-asserted-by":"crossref","unstructured":"Ceylan, D., Huang, C.H.P., Mitra, N.J.: Pix2Video: video editing using image diffusion. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Chan, C., Ginosar, S., Zhou, T., Efros, A.A.: Everybody dance now. In: IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00603"},{"key":"11_CR6","unstructured":"Chen, W., et al.: Control-a-video: controllable text-to-video generation with diffusion models. 
arXiv:2305.13840 (2023)"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Cudeiro, D., Bolkart, T., Laidlaw, C., Ranjan, A., Black, M.J.: Capture, learning, and synthesis of 3D speaking styles. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01034"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lin, Z., Saito, J., Wang, W., Komura, T.: FaceFormer: speech-driven 3D facial animation with transformers. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"11_CR9","unstructured":"Geyer, M., Bar-Tal, O., Bagon, S., Dekel, T.: TokenFlow: consistent diffusion features for consistent video editing. arXiv preprint arxiv:2307.10373 (2023)"},{"key":"11_CR10","doi-asserted-by":"crossref","unstructured":"Ginosar, S., Bar, A., Kohavi, G., Chan, C., Owens, A., Malik, J.: Learning individual styles of conversational gesture. In: Computer Vision and Pattern Recognition (CVPR). IEEE (2019)","DOI":"10.1109\/CVPR.2019.00361"},{"key":"11_CR11","doi-asserted-by":"crossref","unstructured":"Guan, J., et al.: StyleSync: high-fidelity generalized and personalized lip sync in style-based generator. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00151"},{"key":"11_CR12","doi-asserted-by":"crossref","unstructured":"Guo, Y., Chen, K., Liang, S., Liu, Y., Bao, H., Zhang, J.: AD-NeRF: audio driven neural radiance fields for talking head synthesis. In: IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Habibie, I., et al.: Learning speech-driven 3D conversational gestures from video. In: Proceedings of the 21st ACM International Conference on Intelligent Virtual Agents, pp. 101\u2013108 (2021)","DOI":"10.1145\/3472306.3478335"},{"key":"11_CR14","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: NeurIPS, vol. 30 (2017)"},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: Make-your-anchor: a diffusion-based 2D avatar generation framework. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00668"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Huh, M., Zhang, R., Zhu, J.Y., Paris, S., Hertzmann, A.: Transforming and projecting images to class-conditional generative networks. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58536-5_2"},{"key":"11_CR17","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.Y., Zhou, T., Efros, A.A.: Image-to-image translation with conditional adversarial networks. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.632"},{"issue":"4","key":"11_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. (TOG) 36(4), 1\u201312 (2017)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"6","key":"11_CR19","doi-asserted-by":"publisher","first-page":"194","DOI":"10.1145\/3130800.3130813","volume":"36","author":"T Li","year":"2017","unstructured":"Li, T., Bolkart, T., Black, M.J., Li, H., Romero, J.: Learning a model of facial shape and expression from 4D scans. ACM Trans. Graph. 36(6), 194 (2017)","journal-title":"ACM Trans. 
Graph."},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Lin, S., Yang, L., Saleemi, I., Sengupta, S.: Robust high-resolution video matting with temporal guidance. In: WACV (2022)","DOI":"10.1109\/WACV51458.2022.00319"},{"key":"11_CR21","unstructured":"Liu, X., et al.: Audio-driven co-speech gesture video generation. In: NeurIPS (2022)"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. In: Seminal Graphics Papers: Pushing the Boundaries, vol. 2, pp. 851\u2013866 (2023)","DOI":"10.1145\/3596711.3596800"},{"issue":"6","key":"11_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3478513.3480484","volume":"40","author":"Y Lu","year":"2021","unstructured":"Lu, Y., Chai, J., Cao, X.: Live speech portraits: real-time photorealistic talking-head animation. ACM Trans. Graph. 40(6), 1\u20137 (2021). https:\/\/doi.org\/10.1145\/3478513.3480484","journal-title":"ACM Trans. Graph."},{"key":"11_CR24","doi-asserted-by":"crossref","unstructured":"Ma, Y., et al.: StyleTalk: one-shot talking head generation with controllable speaking styles. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 1896\u20131904 (2023)","DOI":"10.1609\/aaai.v37i2.25280"},{"key":"11_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1007\/978-3-030-58598-3_22","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Mallya","year":"2020","unstructured":"Mallya, A., Wang, T.-C., Sapra, K., Liu, M.-Y.: World-consistent video-to-video synthesis. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12353, pp. 359\u2013378. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58598-3_22"},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Mensah, D., Kim, N.H., Aittala, M., Laine, S., Lehtinen, J.: A hybrid generator architecture for controllable face synthesis. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201310 (2023)","DOI":"10.1145\/3588432.3591563"},{"key":"11_CR27","unstructured":"Van\u00a0den Oord, A., Kalchbrenner, N., Espeholt, L., Vinyals, O., Graves, A.: Conditional image generation with PixelCNN decoders. In: NeurIPS, vol. 29 (2016)"},{"key":"11_CR28","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. In: NeurIPS, vol. 35 (2022)"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Park, T., Liu, M.Y., Wang, T.C., Zhu, J.Y.: Semantic image synthesis with spatially-adaptive normalization. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00244"},{"key":"11_CR30","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"11_CR31","doi-asserted-by":"crossref","unstructured":"Prajwal, K.R., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: ACM MM (2020)","DOI":"10.1145\/3394171.3413532"},{"key":"11_CR32","doi-asserted-by":"crossref","unstructured":"Z Qi, C., et al.: FateZero: fusing attentions for zero-shot text-based video editing. 
arXiv:2303.09535 (2023)","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"11_CR33","doi-asserted-by":"crossref","unstructured":"Qian, S., Tu, Z., Zhi, Y., Liu, W., Gao, S.: Speech drives templates: co-speech gesture synthesis with learned templates. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.01089"},{"key":"11_CR34","unstructured":"Ravi, N., et al.: Accelerating 3D deep learning with PyTorch3D. arXiv:2007.08501 (2020)"},{"key":"11_CR35","doi-asserted-by":"crossref","unstructured":"Richard, A., Zollh\u00f6fer, M., Wen, Y., De\u00a0la Torre, F., Sheikh, Y.: MeshTalk: 3D face animation from speech using cross-modality disentanglement. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"11_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"11_CR37","doi-asserted-by":"crossref","unstructured":"Shen, S., et al.: DiffTalk: crafting diffusion models for generalized audio-driven portraits animation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"11_CR38","unstructured":"Siarohin, A., Lathuili\u00e8re, S., Tulyakov, S., Ricci, E., Sebe, N.: First order motion model for image animation. In: NeurIPS (2019)"},{"key":"11_CR39","doi-asserted-by":"crossref","unstructured":"Skorokhodov, I., Tulyakov, S., Elhoseiny, M.: StyleGAN-V: a continuous video generator with the price, image quality and perks of styleGAN2. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00361"},{"key":"11_CR40","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.Y., Yang, X., Kautz, J.: MoCoGAN: decomposing motion and content for video generation. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00165"},{"key":"11_CR41","unstructured":"Van Den\u00a0Oord, A., Vinyals, O.: Neural discrete representation learning. In: NeurIPS, vol. 30 (2017)"},{"key":"11_CR42","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: NeurIPS (2016)"},{"key":"11_CR43","doi-asserted-by":"crossref","unstructured":"Wang, J., Qian, X., Zhang, M., Tan, R.T., Li, H.: Seeing what you said: talking face generation guided by a lip reading expert. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01408"},{"key":"11_CR44","unstructured":"Wang, T.C., et al.: Video-to-video synthesis. In: NeurIPS (2018)"},{"key":"11_CR45","doi-asserted-by":"crossref","unstructured":"Wang, T.C., Liu, M.Y., Zhu, J.Y., Tao, A., Kautz, J., Catanzaro, B.: High-resolution image synthesis and semantic manipulation with conditional GANs. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00917"},{"key":"11_CR46","doi-asserted-by":"crossref","unstructured":"Wang, T.C., Mallya, A., Liu, M.Y.: One-shot free-view neural talking-head synthesis for video conferencing. 
In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00991"},{"issue":"4","key":"11_CR47","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A.C., Sheikh, H.R., Simoncelli, E.P.: Image quality assessment: from error visibility to structural similarity. IEEE Trans. Image Process. 13(4), 600\u2013612 (2004)","journal-title":"IEEE Trans. Image Process."},{"key":"11_CR48","doi-asserted-by":"crossref","unstructured":"Wu, H., Jia, J., Wang, H., Dou, Y., Duan, C., Deng, Q.: Imitating arbitrary talking style for realistic audio-driven talking face synthesis. In: ACM MM (2021)","DOI":"10.1145\/3474085.3475280"},{"key":"11_CR49","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. arXiv preprint arXiv:2212.11565 (2022)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"11_CR50","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: Video generation using VQ-VAE and transformers. arXiv preprint arXiv:2104.10157 (2021)"},{"key":"11_CR51","doi-asserted-by":"crossref","unstructured":"Yang, S., Zhou, Y., Liu, Z., , Loy, C.C.: Rerender a video: zero-shot text-guided video-to-video translation. In: ACM SIGGRAPH Asia Conference Proceedings (2023)","DOI":"10.1145\/3610548.3618160"},{"key":"11_CR52","doi-asserted-by":"crossref","unstructured":"Yang, S., et al.: DiffuseStyleGesture: stylized audio-driven co-speech gesture generation with diffusion models. In: IJCAI (2023)","DOI":"10.24963\/ijcai.2023\/650"},{"issue":"3","key":"11_CR53","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3449063","volume":"40","author":"X Yao","year":"2021","unstructured":"Yao, X., Fried, O., Fatahalian, K., Agrawala, M.: Iterative text-based editing of talking-heads using neural retargeting. ACM Trans. Graph. (TOG) 40(3), 1\u201314 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"11_CR54","doi-asserted-by":"crossref","unstructured":"Yi, H., et al.: Generating holistic 3D human motion from speech. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"11_CR55","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"11_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, W., et al.: SadTalker: learning realistic 3D motion coefficients for stylized audio-driven single image talking face animation. arXiv preprint arXiv:2211.12194 (2022)","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"11_CR57","doi-asserted-by":"crossref","unstructured":"Zhao, J., Zhang, H.: Thin-plate spline motion model for image animation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"11_CR58","doi-asserted-by":"crossref","unstructured":"Zhu, L., Liu, X., Liu, X., Qian, R., Liu, Z., Yu, L.: Taming diffusion models for audio-driven co-speech gesture generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01016"},{"key":"11_CR59","doi-asserted-by":"crossref","unstructured":"Zielonka, W., Bolkart, T., Thies, J.: Instant volumetric head avatars. 
In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00444"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73024-5_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T17:06:54Z","timestamp":1732554414000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73024-5_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,24]]},"ISBN":["9783031730238","9783031730245"],"references-count":59,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73024-5_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,24]]},"assertion":[{"value":"24 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
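
The record above is a standard Crossref REST API "work" response: everything of bibliographic interest (title, authors, DOI, pagination, ISBNs, the 59 reference entries) sits under the "message" key. As a minimal sketch, assuming only the Python standard library and the public api.crossref.org works endpoint (the mailto contact in the User-Agent header is a hypothetical placeholder), a record like this can be fetched and queried as follows:

import json
import urllib.request

# DOI of the chapter described by the record above.
DOI = "10.1007/978-3-031-73024-5_11"

# Crossref's public works endpoint returns the same envelope as above:
# {"status": "ok", "message-type": "work", "message": {...}}.
req = urllib.request.Request(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"},  # hypothetical contact
)
with urllib.request.urlopen(req) as resp:
    record = json.load(resp)

assert record["status"] == "ok" and record["message-type"] == "work"
work = record["message"]

print(work["title"][0])          # "Co-speech Gesture Video Generation with 3D Human Meshes"
print(work["page"])              # "172-189"
print(work["references-count"])  # 59

# Each entry of work["reference"] mirrors one "11_CR*" object above; the field
# is optional in Crossref records generally, hence the guarded lookups.
for ref in work.get("reference", [])[:3]:
    print("-", ref.get("unstructured", ref.get("DOI", "")))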