{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T18:43:31Z","timestamp":1772822611306,"version":"3.50.1"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T00:00:00Z","timestamp":1770854400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T00:00:00Z","timestamp":1770854400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["CCF Trans. Pervasive Comp. Interact."],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s42486-025-00213-z","type":"journal-article","created":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T13:23:10Z","timestamp":1770902590000},"page":"165-180","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhanced data techniques and optimization in conversational gesture generation"],"prefix":"10.1007","volume":"8","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3893-3888","authenticated-orcid":false,"given":"Xiang","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifeng","family":"Peng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhaoxiang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kai","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiguo","family":"Lian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,12]]},"reference":[{"key":"213_CR1","doi-asserted-by":"crossref","unstructured":"Ahuja, C., Lee, D.W., Ishii, R., Morency, L.-P.: No gestures left behind: learning relationships between spoken language and freeform gestures. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 1884\u20131895 (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.170"},{"key":"213_CR2","doi-asserted-by":"crossref","unstructured":"Alexanderson, S., Henter, G.E., Kucherenko, T., Beskow, J.: Style-controllable speech-driven gesture synthesis using normalising flows. In: Computer graphics forum, vol.\u00a039, Wiley Online Library, (2020), pp. 487\u2013496","DOI":"10.1111\/cgf.13946"},{"issue":"4","key":"213_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592458","volume":"42","author":"S Alexanderson","year":"2023","unstructured":"Alexanderson, S., Nagy, R., Beskow, J., Henter, G.E.: Listen, denoise, action! audio-driven motion synthesis with diffusion models. ACM Trans. Graph. (TOG) 42(4), 1\u201320 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"6","key":"213_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3550454.3555435","volume":"41","author":"T Ao","year":"2022","unstructured":"Ao, T., Gao, Q., Lou, Y., Chen, B., Liu, L.: Rhythmic gesticulator: rhythm-aware co-speech gesture synthesis with hierarchical neural embeddings. ACM Trans. Graph. (TOG) 41(6), 1\u201319 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"4","key":"213_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592097","volume":"42","author":"T Ao","year":"2023","unstructured":"Ao, T., Zhang, Z., Liu, L.: Gesturediffuclip: gesture diffusion model with clip latents. ACM Trans. Graph (TOG) 42(4), 1\u201318 (2023)","journal-title":"ACM Trans. Graph (TOG)"},{"key":"213_CR6","doi-asserted-by":"crossref","unstructured":"Bhattacharya, U., Childs, E., Rewkowski, N., Manocha, D.: Speech2affectivegestures: synthesizing co-speech gestures with generative adversarial affective expression learning. In: Proceedings of the 29th ACM international conference on multimedia, (2021), pp. 2027\u20132036","DOI":"10.1145\/3474085.3475223"},{"key":"213_CR7","doi-asserted-by":"crossref","unstructured":"Cassell, J., Pelachaud, C., Badler, N., Steedman, M., Achorn, B., Becket, T., Douville, B., Prevost, S., Stone, M.: Animated conversation: rule-based generation of facial expression, gesture and spoken intonation for multiple conversational agents. In: Proceedings of the 21st annual conference on Computer graphics and interactive techniques, (1994), pp. 413\u2013420","DOI":"10.1145\/192161.192272"},{"key":"213_CR8","doi-asserted-by":"crossref","unstructured":"Cassell, J., Vilhj\u00e1lmsson, H.H., Bickmore, T.: Beat: the behavior expression animation toolkit. In: Proceedings of the 28th annual conference on computer graphics and interactive techniques, (2001), pp. 477\u2013486","DOI":"10.1145\/383259.383315"},{"key":"213_CR9","unstructured":"Chung, J., Gulcehre, C., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling, arXiv preprint arXiv:1412.3555 (2014)"},{"key":"213_CR10","doi-asserted-by":"crossref","unstructured":"Ferstl, Y., McDonnell, R.: Investigating the use of recurrent motion modelling for speech gesture generation. In: Proceedings of the 18th international conference on intelligent virtual agents, (2018), pp. 93\u201398","DOI":"10.1145\/3267851.3267898"},{"key":"213_CR11","doi-asserted-by":"crossref","unstructured":"Frechette, C., Moreno, R.: The roles of animated pedagogical agents\u2019 presence and nonverbal communication in multimedia learning environments, J. Media Psychol. (2010)","DOI":"10.1027\/1864-1105\/a000009"},{"issue":"200","key":"213_CR12","doi-asserted-by":"publisher","first-page":"675","DOI":"10.1080\/01621459.1937.10503522","volume":"32","author":"M Friedman","year":"1937","unstructured":"Friedman, M.: The use of ranks to avoid the assumption of normality implicit in the analysis of variance. J. Am. Stat. Assoc. 32(200), 675\u2013701 (1937)","journal-title":"J. Am. Stat. Assoc."},{"key":"213_CR13","doi-asserted-by":"crossref","unstructured":"Gao, D., Shi, J., Chuang, S.-P., Garcia, L.P., Lee, H.-y., Watanabe, S., Khudanpur, S.: EURO: ESPnet unsupervised asr open-source toolkit, arXiv preprint arXiv:2211.17196 (2022)","DOI":"10.1109\/ICASSP49357.2023.10096977"},{"key":"213_CR14","doi-asserted-by":"crossref","unstructured":"Ginosar, S., Bar, A., Kohavi, G., Chan, C., Owens, A., Malik, J.: Learning individual styles of conversational gesture. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (2019), pp. 3497\u20133506","DOI":"10.1109\/CVPR.2019.00361"},{"key":"213_CR15","doi-asserted-by":"crossref","unstructured":"Habibie, I., Xu, W., Mehta, D., Liu, L., Seidel, H.-P., Pons-Moll, G., Elgharib, M., Theobalt, C.: Learning speech-driven 3d conversational gestures from video. In: Proceedings of the 21st ACM international conference on intelligent virtual agents, (2021), pp. 101\u2013108","DOI":"10.1145\/3472306.3478335"},{"key":"213_CR16","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"W-N Hsu","year":"2021","unstructured":"Hsu, W.-N., Bolte, B., Tsai, Y.-H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: Hubert: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"213_CR17","doi-asserted-by":"crossref","unstructured":"Joo, H., Simon, T., Cikara, M., Sheikh, Y.: Towards social artificial intelligence: Nonverbal social signal prediction in a triadic interaction. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (2019), pp. 10873\u201310883","DOI":"10.1109\/CVPR.2019.01113"},{"issue":"177","key":"213_CR18","first-page":"90","volume":"7","author":"A Kendon","year":"1972","unstructured":"Kendon, A.: Some relationships between body motion and speech. Stud. Dyadic Commun. 7(177), 90 (1972)","journal-title":"Stud. Dyadic Commun."},{"key":"213_CR19","volume-title":"Gesture Generation by Imitation: From Human Behavior to Computer Character Animation","author":"M Kipp","year":"2005","unstructured":"Kipp, M.: Gesture Generation by Imitation: From Human Behavior to Computer Character Animation. Universal-Publishers (2005)"},{"key":"213_CR20","doi-asserted-by":"crossref","unstructured":"Kopp, S., Wachsmuth, I.: Model-based animation of co-verbal gesture. In: Proceedings of computer animation 2002 (CA 2002), IEEE, (2002), pp. 252\u2013257","DOI":"10.1109\/CA.2002.1017547"},{"key":"213_CR21","doi-asserted-by":"crossref","unstructured":"Kucherenko, T., Jonell, P., Yoon, Y., Wolfert, P., Henter, G.E.: A large, crowdsourced evaluation of gesture generation systems on common data: The genea challenge 2020. In: 26th international conference on intelligent user interfaces, (2021), pp. 11\u201321","DOI":"10.1145\/3397481.3450692"},{"key":"213_CR22","doi-asserted-by":"crossref","unstructured":"Kucherenko, T., Nagy, R., Yoon, Y., Woo, J., Nikolov, T., Tsakov, M., Henter, G.\u00a0E.: The genea challenge 2023: A large-scale evaluation of gesture generation models in monadic and dyadic settings. In: Proceedings of the 25th international conference on multimodal interaction, (2023), pp. 792\u2013801","DOI":"10.1145\/3577190.3616120"},{"key":"213_CR23","doi-asserted-by":"crossref","unstructured":"Levine, S., Theobalt, C., Koltun, V.: Real-time prosody-driven synthesis of body language. In: ACM SIGGRAPH Asia 2009 papers, (2009), pp. 1\u201310","DOI":"10.1145\/1661412.1618518"},{"key":"213_CR24","doi-asserted-by":"crossref","unstructured":"Li, J., Kang, D., Pei, W., Zhe, X., Zhang, Y., He, Z., Bao, L.: Audio2gestures: Generating diverse gestures from speech audio with conditional variational autoencoders. In: Proceedings of the IEEE\/CVF international conference on computer vision, (2021), pp. 11293\u201311302","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"213_CR25","doi-asserted-by":"crossref","unstructured":"Liang, Y., Feng, Q., Zhu, L., Hu, L., Pan, P., Yang, Y.: Seeg: Semantic energized co-speech gesture generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (2022), pp. 10473\u201310482","DOI":"10.1109\/CVPR52688.2022.01022"},{"key":"213_CR26","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1023\/A:1022628806385","volume":"8","author":"L-J Lin","year":"1992","unstructured":"Lin, L.-J.: Self-improving reactive agents based on reinforcement learning, planning and teaching. Mach. Learn. 8, 293\u2013321 (1992)","journal-title":"Mach. Learn."},{"key":"213_CR27","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., Stoyanov, V.: Roberta: A robustly optimized bert pretraining approach, arXiv preprint arXiv:1907.11692 (2019)"},{"key":"213_CR28","doi-asserted-by":"crossref","unstructured":"Liu, X., Wu, Q., Zhou, H., Xu, Y., Qian, R., Lin, X., Zhou, X., Wu, W., Dai, B., Zhou, B.: Learning hierarchical cross-modal association for co-speech gesture generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (2022), pp. 10462\u201310472","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"213_CR29","doi-asserted-by":"crossref","unstructured":"Liu, H., Zhu, Z., Iwamoto, N., Peng, Y., Li, Z., Zhou, Y., Bozkurt, E., Zheng, B.: Beat: A large-scale semantic and emotional multi-modal dataset for conversational gestures synthesis. In: European conference on computer vision, Springer, (2022), pp. 612\u2013630","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"213_CR30","doi-asserted-by":"crossref","unstructured":"Ma, P., Petridis, S., Pantic, M.: End-to-end audio-visual speech recognition with conformers. In: ICASSP 2021-2021 IEEE international conference on acoustics, speech and signal processing (ICASSP), IEEE, 2021, pp. 7613\u20137617","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"issue":"205","key":"213_CR31","first-page":"109","volume":"34","author":"F Milton","year":"1939","unstructured":"Milton, F.: A correction: the use of ranks to avoid the assumption of normality implicit in the analysis of variance. J. Am. Stat. Assoc. 34(205), 109 (1939)","journal-title":"J. Am. Stat. Assoc."},{"key":"213_CR32","doi-asserted-by":"crossref","unstructured":"Nyatsanga, S., Kucherenko, T., Ahuja, C., Henter, G.E., Neff, M.: A comprehensive review of data-driven co-speech gesture generation. In: Computer Graphics Forum, Vol.\u00a042, Wiley Online Library, (2023), pp. 569\u2013596","DOI":"10.1111\/cgf.14776"},{"issue":"17","key":"213_CR33","doi-asserted-by":"publisher","first-page":"1648","DOI":"10.1080\/10447318.2021.1898851","volume":"37","author":"I Wang","year":"2021","unstructured":"Wang, I., Ruiz, J.: Examining the use of nonverbal communication in virtual agents. Int. J. Human-Computer Interact. 37(17), 1648\u20131673 (2021)","journal-title":"Int. J. Human-Computer Interact."},{"issue":"3","key":"213_CR34","doi-asserted-by":"publisher","first-page":"379","DOI":"10.1109\/THMS.2022.3149173","volume":"52","author":"P Wolfert","year":"2022","unstructured":"Wolfert, P., Robinson, N., Belpaeme, T.: A review of evaluation practices of gesture generation in embodied conversational agents. IEEE Trans. Human-Mach. Syst. 52(3), 379\u2013389 (2022)","journal-title":"IEEE Trans. Human-Mach. Syst."},{"key":"213_CR35","doi-asserted-by":"crossref","unstructured":"Yang, S., Wu, Z., Li, M., Zhang, Z., Hao, L., Bao, W., Cheng, M., Xiao, L.: Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models. In: Proceedings of the thirty-second international joint conference on artificial intelligence, IJCAI-23, (2023), pp. 5860\u20135868","DOI":"10.24963\/ijcai.2023\/650"},{"key":"213_CR36","doi-asserted-by":"crossref","unstructured":"Yang, S., Wu, Z., Li, M., Zhang, Z., Hao, L., Bao, W., Zhuang, H.: Qpgesture: Quantization-based and phase-guided motion matching for natural speech-driven gesture generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (2023), pp. 2321\u20132330","DOI":"10.1109\/CVPR52729.2023.00230"},{"key":"213_CR37","doi-asserted-by":"crossref","unstructured":"Yi, H., Liang, H., Liu, Y., Cao, Q., Wen, Y., Bolkart, T., Tao, D., Black, M.J.: Generating holistic 3d human motion from speech. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (2023), pp. 469\u2013480","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"213_CR38","doi-asserted-by":"crossref","unstructured":"Yoon, Y., Ko, W.-R., Jang, M., Lee, J., Kim, J., Lee, G.: Robots learn social skills: End-to-end learning of co-speech gesture generation for humanoid robots. In: 2019 International conference on robotics and automation (ICRA), IEEE, (2019), pp. 4303\u20134309","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"213_CR39","doi-asserted-by":"crossref","unstructured":"Yoon, Y., Wolfert, P., Kucherenko, T., Viegas, C., Nikolov, T., Tsakov, M., Henter, G.E.: The genea challenge 2022: A large evaluation of data-driven co-speech gesture generation. In: Proceedings of the 2022 international conference on multimodal interaction, (2022), pp. 736\u2013747","DOI":"10.1145\/3536221.3558058"},{"issue":"6","key":"213_CR40","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417838","volume":"39","author":"Y Yoon","year":"2020","unstructured":"Yoon, Y., Cha, B., Lee, J.-H., Jang, M., Lee, J., Kim, J., Lee, G.: Speech gesture generation from the trimodal context of text, audio, and speaker identity. ACM Trans. Graph. (TOG) 39(6), 1\u201316 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"4","key":"213_CR41","first-page":"1","volume":"43","author":"Z Zhang","year":"2024","unstructured":"Zhang, Z., Ao, T., Zhang, Y., Gao, Q., Lin, C., Chen, B., Liu, L.: Semantic gesticulator: semantics-aware co-speech gesture synthesis. ACM Trans. Graph. (TOG) 43(4), 1\u201317 (2024)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"213_CR42","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Barnes, C., Lu, J., Yang, J., Li, H.: On the continuity of rotation representations in neural networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (2019), pp. 5745\u20135753","DOI":"10.1109\/CVPR.2019.00589"},{"key":"213_CR43","doi-asserted-by":"crossref","unstructured":"Zhu, L., Liu, X., Liu, X., Qian, R., Liu, Z., Yu, L.: Taming diffusion models for audio-driven co-speech gesture generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (2023), pp. 10544\u201310553","DOI":"10.1109\/CVPR52729.2023.01016"},{"key":"213_CR44","doi-asserted-by":"crossref","unstructured":"Zhu, W., Ma, X., Ro, D., Ci, H., Zhang, J., Shi, J., Gao, F., Tian, Q., Wang, Y.: Human motion generation: a survey. IEEE Trans. Patt. Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3330935"}],"container-title":["CCF Transactions on Pervasive Computing and Interaction"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42486-025-00213-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42486-025-00213-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42486-025-00213-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T13:03:21Z","timestamp":1772802201000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42486-025-00213-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,12]]},"references-count":44,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["213"],"URL":"https:\/\/doi.org\/10.1007\/s42486-025-00213-z","relation":{},"ISSN":["2524-521X","2524-5228"],"issn-type":[{"value":"2524-521X","type":"print"},{"value":"2524-5228","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,12]]},"assertion":[{"value":"18 July 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"On behalf of all authors, the corresponding author states that there is no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}