{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:29:44Z","timestamp":1777656584021,"version":"3.51.4"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726835","type":"print"},{"value":"9783031726842","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72684-2_1","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:02:45Z","timestamp":1730574165000},"page":"1-17","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Modeling and\u00a0Driving Human Body Soundfields Through Acoustic Primitives"],"prefix":"10.1007","author":[{"given":"Chao","family":"Huang","sequence":"first","affiliation":[]},{"given":"Dejan","family":"Markovi\u0107","sequence":"additional","affiliation":[]},{"given":"Chenliang","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Alexander","family":"Richard","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"1_CR1","unstructured":"Metahuman creator (2021). https:\/\/metahuman.unrealengine.com"},{"key":"1_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"248","DOI":"10.1007\/978-3-030-58523-5_15","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C Ahuja","year":"2020","unstructured":"Ahuja, C., Lee, D.W., Nakano, Y.I., Morency, L.-P.: Style transfer for co-speech gesture animation: a multi-speaker conditional-mixture approach. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12363, pp. 248\u2013265. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58523-5_15"},{"issue":"4","key":"1_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459850","volume":"40","author":"T Bagautdinov","year":"2021","unstructured":"Bagautdinov, T., et al.: Driving-signal aware full-body avatars. ACM Trans. Graph. (TOG) 40(4), 1\u201317 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"1_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1007\/978-3-030-58539-6_2","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C Chen","year":"2020","unstructured":"Chen, C., et al.: SoundSpaces: audio-visual navigation in\u00a03D\u00a0environments. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 17\u201336. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_2"},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"Chen, C., et al.: Novel-view acoustic synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00620"},{"key":"1_CR6","unstructured":"Chen, C., et al.: Soundspaces 2.0: a simulation platform for visual-acoustic learning. In: Advances in Neural Information Processing Systems, vol. 35, pp. 8896\u20138911 (2022)"},{"key":"1_CR7","unstructured":"Chen, Z., Hong, F., Mei, H., Wang, G., Yang, L., Liu, Z.: PrimDiffusion: volumetric primitives diffusion for 3D human generation. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023)"},{"issue":"4","key":"1_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3197517.3201357","volume":"37","author":"A Ephrat","year":"2018","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. ACM Trans. Graph. (TOG) 37(4), 1\u201311 (2018)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Gao, R., Feris, R., Grauman, K.: Learning to separate object sounds by watching unlabeled video. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 35\u201353 (2018)","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: 2.5 D visual sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 324\u2013333 (2019)","DOI":"10.1109\/CVPR.2019.00041"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: VisualVoice: audio-visual speech separation with cross-modal consistency. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15490\u201315500. IEEE (2021)","DOI":"10.1109\/CVPR46437.2021.01524"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Gebru, I.D., et al.: Implicit HRTF modeling using temporal convolutional networks. In: 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2021, pp. 3385\u20133389. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414750"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Ginosar, S., Bar, A., Kohavi, G., Chan, C., Owens, A., Malik, J.: Learning individual styles of conversational gesture. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3497\u20133506 (2019)","DOI":"10.1109\/CVPR.2019.00361"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Hendrix, C., Barfield, W.: The sense of presence within auditory virtual environments. Presence: Teleoperators Virtual Environ. 5(3), 290\u2013301 (1996)","DOI":"10.1162\/pres.1996.5.3.290"},{"key":"1_CR15","unstructured":"Hu, D., et al.: Discriminative sounding objects localization via self-supervised audiovisual matching. In: Advances in Neural Information Processing Systems, vol. 33, pp. 10077\u201310087 (2020)"},{"key":"1_CR16","unstructured":"Huang, C., Liang, S., Tian, Y., Kumar, A., Xu, C.: Davis: high-quality audio-visual separation with generative diffusion models. arXiv preprint arXiv:2308.00122 (2023)"},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Huang, C., Tian, Y., Kumar, A., Xu, C.: Egocentric audio-visual object localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22910\u201322921 (2023)","DOI":"10.1109\/CVPR52729.2023.02194"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"Huang, H., Solah, M., Li, D., Yu, L.F.: Audible panorama: automatic spatial audio generation for panorama imagery. In: Proceedings of the 2019 CHI Conference on Human Factors in Computing Systems, pp. 1\u201311 (2019)","DOI":"10.1145\/3290605.3300851"},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, H., Murdock, C., Ithapu, V.K.: Egocentric deep multi-channel audio-visual active speaker localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10544\u201310552 (2022)","DOI":"10.1109\/CVPR52688.2022.01029"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Kerbl, B., Kopanas, G., Leimk\u00fchler, T., Drettakis, G.: 3D Gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. 42(4) (2023)","DOI":"10.1145\/3592433"},{"key":"1_CR21","unstructured":"Lee, H.Y., et al.: Dancing to music. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"issue":"4","key":"1_CR22","first-page":"1","volume":"37","author":"D Li","year":"2018","unstructured":"Li, D., Langlois, T.R., Zheng, C.: Scene-aware audio for 360 videos. ACM Trans. Graph. (TOG) 37(4), 1\u201312 (2018)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"1_CR23","unstructured":"Liang, S., Huang, C., Tian, Y., Kumar, A., Xu, C.: AV-NeRF: learning neural fields for real-world audio-visual scene synthesis. In: Conference on Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Lombardi, S., Simon, T., Saragih, J., Schwartz, G., Lehrmann, A., Sheikh, Y.: Neural volumes: learning dynamic renderable volumes from images. ACM Trans. Graph. 38(4), 65:1\u201365:14 (2019)","DOI":"10.1145\/3306346.3323020"},{"issue":"4","key":"1_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459863","volume":"40","author":"S Lombardi","year":"2021","unstructured":"Lombardi, S., Simon, T., Schwartz, G., Zollhoefer, M., Sheikh, Y., Saragih, J.: Mixture of volumetric primitives for efficient neural rendering. ACM Trans. Graph. (ToG) 40(4), 1\u201313 (2021)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"1_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1007\/978-3-030-58452-8_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Mildenhall","year":"2020","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 405\u2013421. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_24"},{"key":"1_CR27","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"218","DOI":"10.1007\/978-3-031-19836-6_13","volume-title":"ECCV 2022","author":"S Mo","year":"2022","unstructured":"Mo, S., Morgado, P.: Localizing visual sounds the easy way. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13697, pp. 218\u2013234. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_13"},{"key":"1_CR28","unstructured":"Morgado, P., Nvasconcelos, N., Langlois, T., Wang, O.: Self-supervised generation of spatial audio for 360 video. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Ng, E., et al.: From audio to photoreal embodiment: synthesizing humans in conversations. In: IEEE Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.00101"},{"key":"1_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"292","DOI":"10.1007\/978-3-030-58565-5_18","volume-title":"Computer Vision \u2013 ECCV 2020","author":"R Qian","year":"2020","unstructured":"Qian, R., Hu, D., Dinkel, H., Wu, M., Xu, N., Lin, W.: Multiple sound sources localization from coarse to fine. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 292\u2013308. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_18"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Qian, S., Kirschstein, T., Schoneveld, L., Davoli, D., Giebenhain, S., Nie\u00dfner, M.: GaussianAvatars: photorealistic head avatars with rigged 3D Gaussians (2023)","DOI":"10.1109\/CVPR52733.2024.01919"},{"key":"1_CR32","doi-asserted-by":"crossref","unstructured":"Remelli, E., et\u00a0al.: Drivable volumetric avatars using texel-aligned features. In: ACM SIGGRAPH 2022 Conference Proceedings, pp.\u00a01\u20139 (2022)","DOI":"10.1145\/3528233.3530740"},{"key":"1_CR33","doi-asserted-by":"crossref","unstructured":"Richard, A., Dodds, P., Ithapu, V.K.: Deep impulse responses: estimating and parameterizing filters with deep networks. In: IEEE International Conference on Acoustics, Speech and Signal Processing (2022)","DOI":"10.1109\/ICASSP43922.2022.9746135"},{"key":"1_CR34","doi-asserted-by":"crossref","unstructured":"Richard, A., Lea, C., Ma, S., Gall, J., de\u00a0la Torre, F., Sheikh, Y.: Audio- and gaze-driven facial animation of codec avatars. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 41\u201350 (2021)","DOI":"10.1109\/WACV48630.2021.00009"},{"key":"1_CR35","unstructured":"Richard, A., et al.: Neural synthesis of binaural speech from mono audio. In: International Conference on Learning Representations (2021)"},{"key":"1_CR36","doi-asserted-by":"crossref","unstructured":"Richard, A., Zollhoefer, M., Wen, Y., de\u00a0la Torre, F., Sheikh, Y.: Meshtalk: 3D face animation from speech using cross-modality disentanglement. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"1_CR37","doi-asserted-by":"crossref","unstructured":"Saito, S., Schwartz, G., Simon, T., Li, J., Nam, G.: Relightable gaussian codec avatars (2023)","DOI":"10.1109\/CVPR52733.2024.00021"},{"key":"1_CR38","unstructured":"Samarasinghe, P.N., Abhayapala, T.D.: 3D spatial soundfield recording over large regions. In: Proceedings of the International Workshop on Acoustic Signal Enhancement (IWAENC) (2012)"},{"issue":"9","key":"1_CR39","first-page":"675","volume":"47","author":"L Savioja","year":"1999","unstructured":"Savioja, L., Huopaniemi, J., Lokki, T., V\u00e4\u00e4n\u00e4nen, R.: Creating interactive virtual acoustic environments. J. Audio Eng. Soc. 47(9), 675\u2013705 (1999)","journal-title":"J. Audio Eng. Soc."},{"key":"1_CR40","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-or, D., Bermano, A.H.: Human motion diffusion model. In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"1_CR41","doi-asserted-by":"crossref","unstructured":"Tian, Y., Hu, D., Xu, C.: Cyclic co-learning of sounding object visual grounding and sound separation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2745\u20132754 (2021)","DOI":"10.1109\/CVPR46437.2021.00277"},{"key":"1_CR42","unstructured":"Williams, E.G.: Fourier Acoustics. Academic Press (1999)"},{"key":"1_CR43","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Zhang, Y., Cun, X., Wang, J., Wong, T.T.: Codetalker: speech-driven 3D facial animation with discrete motion prior. arXiv preprint arXiv:2301.02379 (2023)","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"1_CR44","unstructured":"Xudong, X., Markovic, D., Sandakly, J., Keebler, T., Krenn, S., Richard, A.: Sounding bodies: modeling 3D spatial sound of humans using body pose and audio. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023)"},{"key":"1_CR45","doi-asserted-by":"crossref","unstructured":"Yamamoto, R., Song, E., Hwang, M.J., Kim, J.M.: Parallel waveform synthesis based on generative adversarial networks with voicing-aware conditional discriminators. In: 2021 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2021 (ICASSP), pp. 6039\u20136043. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9413369"},{"key":"1_CR46","doi-asserted-by":"crossref","unstructured":"Yi, H., et al.: Generating holistic 3D human motion from speech. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 469\u2013480 (2023)","DOI":"10.1109\/CVPR52729.2023.00053"},{"issue":"6","key":"1_CR47","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417838","volume":"39","author":"Y Yoon","year":"2020","unstructured":"Yoon, Y., et al.: Speech gesture generation from the trimodal context of text, audio, and speaker identity. ACM Trans. Graph. (TOG) 39(6), 1\u201316 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"1_CR48","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 570\u2013586 (2018)","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"1_CR49","unstructured":"Zielonka, W., Bagautdinov, T., Saito, S., Zollh\u00f6fer, M., Thies, J., Romero, J.: Drivable 3D Gaussian avatars (2023)"},{"key":"1_CR50","series-title":"Springer Topics in Signal Processing","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-17207-7","volume-title":"Ambisonics: A Practical 3D Audio Theory for Recording, Studio Production, Sound Reinforcement, and Virtual Reality","author":"F Zotter","year":"2019","unstructured":"Zotter, F., Frank, M.: Ambisonics: A Practical 3D Audio Theory for Recording, Studio Production, Sound Reinforcement, and Virtual Reality. Springer Topics in Signal Processing, Springer, Cham (2019)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72684-2_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:03:30Z","timestamp":1730574210000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72684-2_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726835","9783031726842"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72684-2_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}