{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T17:19:36Z","timestamp":1765041576297,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":26,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819794362"},{"type":"electronic","value":"9789819794379"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-9437-9_29","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T16:32:17Z","timestamp":1730392337000},"page":"369-381","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Optimized Conversational Gesture Generation with\u00a0Enhanced Motion Feature Extraction and\u00a0Cascaded Generator"],"prefix":"10.1007","author":[{"given":"Xiang","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifeng","family":"Peng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhaoxiang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shijie","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruitao","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kai","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiguo","family":"Lian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"29_CR1","doi-asserted-by":"crossref","unstructured":"Alexanderson, S., Henter, G.E., Kucherenko, T., Beskow, J.: Style-controllable speech-driven gesture synthesis using normalising flows. In: Computer Graphics Forum, vol.\u00a039, pp. 487\u2013496. Wiley Online Library (2020)","DOI":"10.1111\/cgf.13946"},{"issue":"4","key":"29_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592458","volume":"42","author":"S Alexanderson","year":"2023","unstructured":"Alexanderson, S., Nagy, R., Beskow, J., Henter, G.E.: Listen, denoise, action! audio-driven motion synthesis with diffusion models. ACM Trans. Graph. (TOG) 42(4), 1\u201320 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"6","key":"29_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3550454.3555435","volume":"41","author":"T Ao","year":"2022","unstructured":"Ao, T., Gao, Q., Lou, Y., Chen, B., Liu, L.: Rhythmic gesticulator: rhythm-aware co-speech gesture synthesis with hierarchical neural embeddings. ACM Trans. Graph. (TOG) 41(6), 1\u201319 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"29_CR4","doi-asserted-by":"crossref","unstructured":"Cassell, J., Vilhj\u00e1lmsson, H.H., Bickmore, T.: Beat: the behavior expression animation toolkit. In: Proceedings of the 28th Annual Conference on Computer Graphics and Interactive Techniques, pp. 477\u2013486 (2001)","DOI":"10.1145\/383259.383315"},{"key":"29_CR5","unstructured":"Chung, J., Gulcehre, C., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)"},{"key":"29_CR6","doi-asserted-by":"crossref","unstructured":"Ginosar, S., Bar, A., Kohavi, G., Chan, C., Owens, A., Malik, J.: Learning individual styles of conversational gesture. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3497\u20133506 (2019)","DOI":"10.1109\/CVPR.2019.00361"},{"key":"29_CR7","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: Hubert: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"29_CR8","unstructured":"Kipp, M.: Gesture generation by imitation: From human behavior to computer character animation. Universal-Publishers (2005)"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Kopp, S., Wachsmuth, I.: Model-based animation of co-verbal gesture. In: Proceedings of Computer Animation 2002 (CA 2002), pp. 252\u2013257. IEEE (2002)","DOI":"10.1109\/CA.2002.1017547"},{"key":"29_CR10","doi-asserted-by":"crossref","unstructured":"Kucherenko, T., Jonell, P., Yoon, Y., Wolfert, P., Henter, G.E.: a large, crowdsourced evaluation of gesture generation systems on common data: the genea challenge 2020. In: 26th International Conference on Intelligent User Interfaces, pp. 11\u201321 (2021)","DOI":"10.1145\/3397481.3450692"},{"key":"29_CR11","doi-asserted-by":"crossref","unstructured":"Kucherenko, T., et al.: The genea challenge 2023: a large-scale evaluation of gesture generation models in monadic and dyadic settings. In: Proceedings of the 25th International Conference on Multimodal Interaction, pp. 792\u2013801 (2023)","DOI":"10.1145\/3577190.3616120"},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"Levine, S., Theobalt, C., Koltun, V.: Real-time prosody-driven synthesis of body language. In: ACM SIGGRAPH Asia 2009 Papers, pp. 1\u201310 (2009)","DOI":"10.1145\/1661412.1618518"},{"key":"29_CR13","doi-asserted-by":"crossref","unstructured":"Li, J., et al.: Audio2gestures: generating diverse gestures from speech audio with conditional variational autoencoders. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11293\u201311302 (2021)","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"29_CR14","doi-asserted-by":"publisher","unstructured":"Liu, H., et al.: Beat: a large-scale semantic and emotional multi-modal dataset for conversational gestures synthesis. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, pp. 612\u2013630. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-20071-7_36","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"29_CR15","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: Learning hierarchical cross-modal association for co-speech gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10462\u201310472 (2022)","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"29_CR16","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"29_CR17","doi-asserted-by":"crossref","unstructured":"Ma, P., Petridis, S., Pantic, M.: End-to-end audio-visual speech recognition with conformers. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7613\u20137617. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"29_CR18","doi-asserted-by":"crossref","unstructured":"Nyatsanga, S., Kucherenko, T., Ahuja, C., Henter, G.E., Neff, M.: A comprehensive review of data-driven co-speech gesture generation. In: Computer Graphics Forum, vol.\u00a042, pp. 569\u2013596. Wiley Online Library (2023)","DOI":"10.1111\/cgf.14776"},{"key":"29_CR19","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3d human motion synthesis with transformer vae. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10985\u201310995 (2021)","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"29_CR20","doi-asserted-by":"publisher","unstructured":"Watanabe, S., et al.: ESPnet: end-to-end speech processing toolkit. In: Proceedings of Interspeech, pp. 2207\u20132211 (2018). https:\/\/doi.org\/10.21437\/Interspeech.2018-1456","DOI":"10.21437\/Interspeech.2018-1456"},{"issue":"3","key":"29_CR21","doi-asserted-by":"publisher","first-page":"379","DOI":"10.1109\/THMS.2022.3149173","volume":"52","author":"P Wolfert","year":"2022","unstructured":"Wolfert, P., Robinson, N., Belpaeme, T.: A review of evaluation practices of gesture generation in embodied conversational agents. IEEE Trans. Hum.-Mach. Syst. 52(3), 379\u2013389 (2022)","journal-title":"IEEE Trans. Hum.-Mach. Syst."},{"key":"29_CR22","doi-asserted-by":"crossref","unstructured":"Yang, S., et al.: Diffusestylegesture: stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919 (2023)","DOI":"10.24963\/ijcai.2023\/650"},{"issue":"6","key":"29_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417838","volume":"39","author":"Y Yoon","year":"2020","unstructured":"Yoon, Y., et al.: Speech gesture generation from the trimodal context of text, audio, and speaker identity. ACM Trans. Graph. (TOG) 39(6), 1\u201316 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"29_CR24","doi-asserted-by":"crossref","unstructured":"Yoon, Y., et al.: The genea challenge 2022: a large evaluation of data-driven co-speech gesture generation. In: Proceedings of the 2022 International Conference on Multimodal Interaction, pp. 736\u2013747 (2022)","DOI":"10.1145\/3536221.3558058"},{"key":"29_CR25","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Barnes, C., Lu, J., Yang, J., Li, H.: On the continuity of rotation representations in neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5745\u20135753 (2019)","DOI":"10.1109\/CVPR.2019.00589"},{"key":"29_CR26","doi-asserted-by":"crossref","unstructured":"Zhu, L., Liu, X., Liu, X., Qian, R., Liu, Z., Yu, L.: Taming diffusion models for audio-driven co-speech gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10544\u201310553 (2023)","DOI":"10.1109\/CVPR52729.2023.01016"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-9437-9_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T16:32:46Z","timestamp":1730392366000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-9437-9_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9789819794362","9789819794379"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-9437-9_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hangzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2024\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}