{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,20]],"date-time":"2025-04-20T18:21:35Z","timestamp":1745173295341,"version":"3.40.3"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726835"},{"type":"electronic","value":"9783031726842"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72684-2_21","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:03:50Z","timestamp":1730574230000},"page":"365-382","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["S$$^{3}$$D-NeRF: Single-Shot Speech-Driven Neural Radiance Field for\u00a0High Fidelity Talking Head Synthesis"],"prefix":"10.1007","author":[{"given":"Dongze","family":"Li","sequence":"first","affiliation":[]},{"given":"Kang","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yifeng","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Peng","sequence":"additional","affiliation":[]},{"given":"Yingya","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Dong","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"issue":"12","key":"21_CR1","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"21_CR2","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol. 33, pp. 12449\u201312460 (2020)"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Blanz, V., Vetter, T.: A morphable model for the synthesis of 3D faces. In: Seminal Graphics Papers: Pushing the Boundaries, vol. 2, pp. 157\u2013164 (2023)","DOI":"10.1145\/3596711.3596730"},{"key":"21_CR4","doi-asserted-by":"crossref","unstructured":"Chan, E.R., et al.: Efficient geometry-aware 3D generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16123\u201316133 (2022)","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7832\u20137841 (2019)","DOI":"10.1109\/CVPR.2019.00802"},{"key":"21_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Chen, C.-S., Lu, J., Ma, K.-K. (eds.) ACCV 2016, Part II. LNCS, vol. 10117, pp. 251\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Deng, Y., Yang, J., Xu, S., Chen, D., Jia, Y., Tong, X.: Accurate 3D face reconstruction with weakly-supervised learning: from single image to image set. In: IEEE Computer Vision and Pattern Recognition Workshops (2019)","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Gafni, G., Thies, J., Zollh\u00f6fer, M., Nie\u00dfner, M.: Dynamic neural radiance fields for monocular 4D facial avatar reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8649\u20138658 (2021)","DOI":"10.1109\/CVPR46437.2021.00854"},{"issue":"11","key":"21_CR9","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"21_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"152","DOI":"10.1007\/978-3-030-58529-7_10","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Guo","year":"2020","unstructured":"Guo, J., Zhu, X., Yang, Y., Yang, F., Lei, Z., Li, S.Z.: Towards fast, accurate and stable 3D dense face alignment. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12364, pp. 152\u2013168. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58529-7_10"},{"key":"21_CR11","doi-asserted-by":"crossref","unstructured":"Guo, Y., Chen, K., Liang, S., Liu, Y.J., Bao, H., Zhang, J.: AD-NeRF: audio driven neural radiance fields for talking head synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5784\u20135794 (2021)","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"21_CR12","doi-asserted-by":"crossref","unstructured":"Hong, F.T., Zhang, L., Shen, L., Xu, D.: Depth-aware generative adversarial network for talking head video generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3397\u20133406 (2022)","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"21_CR13","doi-asserted-by":"crossref","unstructured":"Hong, Y., Peng, B., Xiao, H., Liu, L., Zhang, J.: HeadNeRF: a real-time nerf-based parametric head model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20374\u201320384 (2022)","DOI":"10.1109\/CVPR52688.2022.01973"},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Jang, W., Agapito, L.: CodeNeRF: disentangled neural radiance fields for object categories. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12949\u201312958 (2021)","DOI":"10.1109\/ICCV48922.2021.01271"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and improving the image quality of StyleGAN. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8110\u20138119 (2020)","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"21_CR16","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"21_CR17","doi-asserted-by":"crossref","unstructured":"Li, J., Zhang, J., Bai, X., Zhou, J., Gu, L.: Efficient region-aware neural radiance fields for high-fidelity talking portrait synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7568\u20137578 (2023)","DOI":"10.1109\/ICCV51070.2023.00696"},{"key":"21_CR18","doi-asserted-by":"crossref","unstructured":"Li, W., et al.: One-shot high-fidelity talking-head synthesis with deformable neural radiance field (2023)","DOI":"10.1109\/CVPR52729.2023.01723"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"21_CR20","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/978-3-031-19836-6_7","volume-title":"Computer Vision-ECCV 2022, Part XXXVII","author":"X Liu","year":"2022","unstructured":"Liu, X., Xu, Y., Wu, Q., Zhou, H., Wu, W., Zhou, B.: Semantic-aware implicit neural audio-driven video portrait generation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXXVII. LNCS, vol. 13697, pp. 106\u2013125. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_7"},{"key":"21_CR21","unstructured":"Locatello, F., et al.: Object-centric learning with slot attention. In: Advances in Neural Information Processing Systems, vol. 33, pp. 11525\u201311538 (2020)"},{"key":"21_CR22","doi-asserted-by":"crossref","unstructured":"Ma, Y., et al.: StyleTalk: one-shot talking head generation with controllable speaking styles. In: Proceedings of the Thirty-Seventh AAAI Conference on Artificial Intelligence (AAAI) (2023)","DOI":"10.1609\/aaai.v37i2.25280"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Ma, Z., Zhu, X., Qi, G.J., Lei, Z., Zhang, L.: OTAvatar: one-shot talking face avatar with controllable tri-plane rendering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16901\u201316910 (2023)","DOI":"10.1109\/CVPR52729.2023.01621"},{"key":"21_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1007\/978-3-030-58452-8_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Mildenhall","year":"2020","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 405\u2013421. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_24"},{"issue":"4","key":"21_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530127","volume":"41","author":"T M\u00fcller","year":"2022","unstructured":"M\u00fcller, T., Evans, A., Schied, C., Keller, A.: Instant neural graphics primitives with a multiresolution hash encoding. ACM Trans. Graph. (ToG) 41(4), 1\u201315 (2022)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"Park, K., et al.: Nerfies: deformable neural radiance fields. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5865\u20135874 (2021)","DOI":"10.1109\/ICCV48922.2021.00581"},{"key":"21_CR27","doi-asserted-by":"crossref","unstructured":"Prajwal, K., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 484\u2013492 (2020)","DOI":"10.1145\/3394171.3413532"},{"key":"21_CR28","doi-asserted-by":"crossref","unstructured":"Pumarola, A., Corona, E., Pons-Moll, G., Moreno-Noguer, F.: D-NeRF: neural radiance fields for dynamic scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10318\u201310327 (2021)","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"21_CR29","doi-asserted-by":"crossref","unstructured":"Ren, Y., Li, G., Chen, Y., Li, T.H., Liu, S.: PIRenderer: controllable portrait image generation via semantic neural rendering (2021)","DOI":"10.1109\/ICCV48922.2021.01350"},{"key":"21_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015, Part III. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"21_CR31","doi-asserted-by":"publisher","first-page":"666","DOI":"10.1007\/978-3-031-19775-8_39","volume-title":"Computer Vision-ECCV 2022, Part XII","author":"S Shen","year":"2022","unstructured":"Shen, S., Li, W., Zhu, Z., Duan, Y., Zhou, J., Lu, J.: Learning dynamic facial radiance fields for few-shot talking head synthesis. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XII. LNCS, vol. 13672, pp. 666\u2013682. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19775-8_39"},{"key":"21_CR32","unstructured":"Tang, J., et al.: Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv preprint arXiv:2211.12368 (2022)"},{"key":"21_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"716","DOI":"10.1007\/978-3-030-58517-4_42","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Thies","year":"2020","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., Nie\u00dfner, M.: Neural voice puppetry: audio-driven facial reenactment. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12361, pp. 716\u2013731. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58517-4_42"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: LipFormer: high-fidelity and generalizable talking face generation with a pre-learned facial codebook. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13844\u201313853 (2023)","DOI":"10.1109\/CVPR52729.2023.01330"},{"key":"21_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"700","DOI":"10.1007\/978-3-030-58589-1_42","volume-title":"Computer Vision \u2013 ECCV 2020","author":"K Wang","year":"2020","unstructured":"Wang, K., et al.: MEAD: a large-scale audio-visual dataset for emotional talking-face generation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12366, pp. 700\u2013717. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_42"},{"key":"21_CR36","doi-asserted-by":"crossref","unstructured":"Wang, S., Li, L., Ding, Y., Yu, X.: One-shot talking face generation from single-speaker audio-visual correlation learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 2531\u20132539 (2022)","DOI":"10.1609\/aaai.v36i3.20154"},{"key":"21_CR37","doi-asserted-by":"crossref","unstructured":"Wiles, O., Koepke, A., Zisserman, A.: X2Face: a network for controlling face generation using images, audio, and pose codes. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 670\u2013686 (2018)","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"21_CR38","unstructured":"Ye, Z., Jiang, Z., Ren, Y., Liu, J., He, J., Zhao, Z.: GeneFace: generalized and high-fidelity audio-driven 3D talking face synthesis. arXiv preprint arXiv:2301.13430 (2023)"},{"key":"21_CR39","unstructured":"Yi, R., Ye, Z., Zhang, J., Bao, H., Liu, Y.J.: Audio-driven talking face video generation with learning-based personalized head pose. arXiv preprint arXiv:2002.10137 (2020)"},{"key":"21_CR40","doi-asserted-by":"crossref","unstructured":"Yu, W., et al.: NOFA: NeRF-based one-shot facial avatar reconstruction. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201312 (2023)","DOI":"10.1145\/3588432.3591555"},{"key":"21_CR41","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"21_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, W., et al.: SadTalker: learning realistic 3D motion coefficients for stylized audio-driven single image talking face animation. arXiv preprint arXiv:2211.12194 (2022)","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"21_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., Fan, C.: Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3661\u20133670 (2021)","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"21_CR44","doi-asserted-by":"crossref","unstructured":"Zhou, H., Sun, Y., Wu, W., Loy, C.C., Wang, X., Liu, Z.: Pose-controllable talking face generation by implicitly modularized audio-visual representation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00416"},{"issue":"6","key":"21_CR45","first-page":"1","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: MakeltTalk: speaker-aware talking-head animation. ACM Trans. Graph. (TOG) 39(6), 1\u201315 (2020)","journal-title":"ACM Trans. Graph. (TOG)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72684-2_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:10:53Z","timestamp":1730574653000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72684-2_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726835","9783031726842"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72684-2_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}