{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T11:37:03Z","timestamp":1773920223260,"version":"3.50.1"},"reference-count":109,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100009776","name":"McMaster University","doi-asserted-by":"publisher","award":["20019495"],"award-info":[{"award-number":["20019495"]}],"id":[{"id":"10.13039\/100009776","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-025-03910-9","type":"journal-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T08:03:07Z","timestamp":1745308987000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Audio2Moves: Two-Level Hierarchical Framework for Audio-Driven Human Motion Synthesis"],"prefix":"10.1007","volume":"6","author":[{"given":"Yanbo","family":"Cheng","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nada","family":"Elmasry","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5680-1929","authenticated-orcid":false,"given":"Yingying","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"3910_CR1","doi-asserted-by":"crossref","unstructured":"Tevet G, Raab S, Gordon B, Shafir Y, Cohen-or D, Bermano A.H. Human motion diffusion model. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=SJ1kSyO2jwu","DOI":"10.1145\/3680528.3687579"},{"key":"3910_CR2","unstructured":"Zhang M, Cai Z, Pan L, Hong F, Guo X, Yang L, Liu Z. Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)"},{"key":"3910_CR3","doi-asserted-by":"crossref","unstructured":"Guo C, Mu Y, Javed M.G, Wang S, Cheng L. Momask: Generative masked modeling of 3d human motions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1900\u20131910 (2024)","DOI":"10.1109\/CVPR52733.2024.00186"},{"key":"3910_CR4","doi-asserted-by":"crossref","unstructured":"Aristidou A, Zeng Q, Stavrakis E, Yin K, Cohen-Or D, Chrysanthou Y, Chen B. Emotion control of unstructured dance movements. In: Proceedings of the ACM SIGGRAPH\/Eurographics Symposium on Computer Animation, pp. 1\u201310 (2017)","DOI":"10.1145\/3099564.3099566"},{"key":"3910_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, Y, Wang, Y. Transformer-Based Two-level Approach for Music-driven Dance Choreography. In: Proceedings of the 19th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications - GRAPP, pp. 127\u2013139 (2024). INSTICC","DOI":"10.5220\/0012434500003660"},{"issue":"1","key":"3910_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1330511.1330516","volume":"27","author":"M Neff","year":"2008","unstructured":"Neff M, Kipp M, Albrecht I, Seidel H-P. Gesture modeling and animation based on a probabilistic re-creation of speaker style. ACM Transactions On Graphics (TOG). 2008;27(1):1\u201324.","journal-title":"ACM Transactions On Graphics (TOG)"},{"issue":"6","key":"3910_CR7","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3550454.3555435","volume":"41","author":"T Ao","year":"2022","unstructured":"Ao T, Gao Q, Lou Y, Chen B, Liu L. Rhythmic gesticulator: Rhythm-aware co-speech gesture synthesis with hierarchical neural embeddings. ACM Transactions on Graphics (TOG). 2022;41(6):1\u201319.","journal-title":"ACM Transactions on Graphics (TOG)"},{"issue":"4","key":"3910_CR8","first-page":"1","volume":"36","author":"HJ Smith","year":"2017","unstructured":"Smith HJ, Neff M. Understanding the impact of animated gesture performance on personality perceptions. ACM Transactions on Graphics (TOG). 2017;36(4):1\u201312.","journal-title":"ACM Transactions on Graphics (TOG)"},{"issue":"2","key":"3910_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3340254","volume":"2","author":"HJ Smith","year":"2019","unstructured":"Smith HJ, Cao C, Neff M, Wang Y. Efficient neural networks for real-time motion style transfer. Proceedings of the ACM on Computer Graphics and Interactive Techniques. 2019;2(2):1\u201317.","journal-title":"Proceedings of the ACM on Computer Graphics and Interactive Techniques"},{"issue":"4","key":"3910_CR10","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1145\/2766999","volume":"34","author":"S Xia","year":"2015","unstructured":"Xia S, Wang C, Chai J, Hodgins J. Realtime style transfer for unlabeled heterogeneous human motion. ACM Transactions on Graphics (TOG). 2015;34(4):119.","journal-title":"ACM Transactions on Graphics (TOG)"},{"issue":"4","key":"3910_CR11","doi-asserted-by":"publisher","first-page":"137","DOI":"10.1145\/2897824.2925955","volume":"35","author":"ME Yumer","year":"2016","unstructured":"Yumer ME, Mitra NJ. Spectral style transfer for human motion between independent actions. ACM Transactions on Graphics (TOG). 2016;35(4):137.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"3910_CR12","unstructured":"Morro Motion https:\/\/assetstore.unity.com\/packages\/3d \/animations\/dance-mocap-collection-102966"},{"key":"3910_CR13","doi-asserted-by":"crossref","unstructured":"Ferstl Y, McDonnell R. Iva: Investigating the use of recurrent motion modelling for speech gesture generation. In: IVA \u201918 Proceedings of the 18th International Conference on Intelligent Virtual Agents (2018). https:\/\/trinityspeechgesture.scss.tcd.ie","DOI":"10.1145\/3267851.3267898"},{"issue":"4","key":"3910_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592458","volume":"42","author":"S Alexanderson","year":"2023","unstructured":"Alexanderson S, Nagy R, Beskow J, Henter GE. Listen, denoise, action! audio-driven motion synthesis with diffusion models. ACM Transactions on Graphics (TOG). 2023;42(4):1\u201320.","journal-title":"ACM Transactions on Graphics (TOG)"},{"issue":"3","key":"3910_CR15","doi-asserted-by":"publisher","first-page":"483","DOI":"10.1145\/566654.566606","volume":"21","author":"O Arikan","year":"2002","unstructured":"Arikan O, Forsyth DA. Interactive motion generation from examples. ACM Transactions on Graphics (TOG). 2002;21(3):483\u201390.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"3910_CR16","doi-asserted-by":"crossref","unstructured":"Kovar L, Gleicher M, Pighin F. Motion graphs. In: Seminal Graphics Papers: Pushing the Boundaries, Volume 2, pp. 723\u2013732 (2023)","DOI":"10.1145\/3596711.3596788"},{"issue":"5","key":"3910_CR17","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1109\/38.708559","volume":"18","author":"C Rose","year":"1998","unstructured":"Rose C, Cohen MF, Bodenheimer B. Verbs and adverbs: Multidimensional motion interpolation. IEEE Comput Graphics Appl. 1998;18(5):32\u201340.","journal-title":"IEEE Comput Graphics Appl"},{"key":"3910_CR18","doi-asserted-by":"crossref","unstructured":"Mukai T, Kuriyama S. Geostatistical motion interpolation. In: ACM SIGGRAPH 2005 Papers, pp. 1062\u20131070 (2005)","DOI":"10.1145\/1186822.1073313"},{"key":"3910_CR19","unstructured":"Bowden R. Learning statistical models of human motion. In: IEEE Workshop on Human Modeling, Analysis and Synthesis, CVPR, vol. 2000 (2000)"},{"issue":"3","key":"3910_CR20","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1006\/cviu.2000.0894","volume":"81","author":"A Galata","year":"2001","unstructured":"Galata A, Johnson N, Hogg D. Learning variable-length markov models of behavior. Comput Vis Image Underst. 2001;81(3):398\u2013413.","journal-title":"Comput Vis Image Underst"},{"issue":"4","key":"3910_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073663","volume":"36","author":"D Holden","year":"2017","unstructured":"Holden D, Komura T, Saito J. Phase-functioned neural networks for character control. ACM Transactions on Graphics (TOG). 2017;36(4):1\u201313.","journal-title":"ACM Transactions on Graphics (TOG)"},{"issue":"4","key":"3910_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3197517.3201366","volume":"37","author":"H Zhang","year":"2018","unstructured":"Zhang H, Starke S, Komura T, Saito J. Mode-adaptive neural networks for quadruped motion control. ACM Transactions on Graphics (TOG). 2018;37(4):1\u201311.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"3910_CR23","doi-asserted-by":"crossref","unstructured":"Wang Y, Neff M. Deep signatures for indexing and retrieval in large motion databases. In: Proceedings of the 8th ACM SIGGRAPH Conference on Motion in Games, pp. 37\u201345 (2015)","DOI":"10.1145\/2822013.2822024"},{"key":"3910_CR24","doi-asserted-by":"crossref","unstructured":"Holden D, Saito J, Komura T, Joyce T. Learning motion manifolds with convolutional autoencoders. In: SIGGRAPH Asia 2015 Technical Briefs, pp. 1\u20134 (2015)","DOI":"10.1145\/2820903.2820918"},{"issue":"4","key":"3910_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2897824.2925975","volume":"35","author":"D Holden","year":"2016","unstructured":"Holden D, Saito J, Komura T. A deep learning framework for character motion synthesis and editing. ACM Transactions on Graphics (TOG). 2016;35(4):1\u201311.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"3910_CR26","doi-asserted-by":"crossref","unstructured":"Martinez J, Black M.J, Romero J. On human motion prediction using recurrent neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2891\u20132900 (2017)","DOI":"10.1109\/CVPR.2017.497"},{"issue":"1","key":"3910_CR27","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1109\/TVCG.2019.2938520","volume":"27","author":"Z Wang","year":"2019","unstructured":"Wang Z, Chai J, Xia S. Combining recurrent neural networks and adversarial training for human motion synthesis and control. IEEE Trans Visual Comput Graphics. 2019;27(1):14\u201328.","journal-title":"IEEE Trans Visual Comput Graphics"},{"issue":"1","key":"3910_CR28","doi-asserted-by":"publisher","first-page":"216","DOI":"10.1109\/TVCG.2019.2936810","volume":"27","author":"H Wang","year":"2019","unstructured":"Wang H, Ho ES, Shum HP, Zhu Z. Spatio-temporal manifold learning for human motions via long-horizon modeling. IEEE Trans Visual Comput Graphics. 2019;27(1):216\u201327.","journal-title":"IEEE Trans Visual Comput Graphics"},{"key":"3910_CR29","unstructured":"Li Z, Zhou Y, Xiao S, He C, Huang Z, Li H. Auto-conditioned recurrent networks for extended complex human motion synthesis. arXiv preprint arXiv:1707.05363 (2017)"},{"key":"3910_CR30","doi-asserted-by":"crossref","unstructured":"Tang T, Jia J, Mao H. Dance with melody: An lstm-autoencoder approach to music-oriented dance synthesis. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 1598\u20131606 (2018)","DOI":"10.1145\/3240508.3240526"},{"issue":"4","key":"3910_CR31","first-page":"1","volume":"41","author":"P Li","year":"2022","unstructured":"Li P, Aberman K, Zhang Z, Hanocka R, Sorkine-Hornung O. Ganimator: Neural motion synthesis from a single sequence. ACM Transactions on Graphics (TOG). 2022;41(4):1\u201312.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"3910_CR32","unstructured":"Heess N, Tb D, Sriram S, Lemmon J, Merel J, Wayne G, Tassa Y, Erez T, Wang Z, Eslami S, et al. Emergence of locomotion behaviours in rich environments. arXiv preprint arXiv:1707.02286 (2017)"},{"issue":"4","key":"3910_CR33","first-page":"1","volume":"37","author":"XB Peng","year":"2018","unstructured":"Peng XB, Abbeel P, Levine S, Panne M. Deepmimic: Example-guided deep reinforcement learning of physics-based character skills. ACM Transactions On Graphics (TOG). 2018;37(4):1\u201314.","journal-title":"ACM Transactions On Graphics (TOG)"},{"issue":"4","key":"3910_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459670","volume":"40","author":"XB Peng","year":"2021","unstructured":"Peng XB, Ma Z, Abbeel P, Levine S, Kanazawa A. Amp: Adversarial motion priors for stylized physics-based character control. ACM Transactions on Graphics (ToG). 2021;40(4):1\u201320.","journal-title":"ACM Transactions on Graphics (ToG)"},{"issue":"5","key":"3910_CR35","doi-asserted-by":"publisher","first-page":"445","DOI":"10.1002\/cav.1477","volume":"24","author":"Y Zhu","year":"2013","unstructured":"Zhu Y, Ramakrishnan AS, Hamann B, Neff M. A system for automatic animation of piano performances. Computer Animation and Virtual Worlds. 2013;24(5):445\u201357.","journal-title":"Computer Animation and Virtual Worlds"},{"key":"3910_CR36","unstructured":"ElKoura G, Singh K. Handrix: animating the human hand. In: Proceedings of the 2003 ACM SIGGRAPH\/Eurographics Symposium on Computer Animation, pp. 110\u2013119 (2003)"},{"key":"3910_CR37","doi-asserted-by":"crossref","unstructured":"Shlizerman E, Dery L, Schoen H, Kemelmacher-Shlizerman I. Audio to body dynamics. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7574\u20137583 (2018)","DOI":"10.1109\/CVPR.2018.00790"},{"key":"3910_CR38","doi-asserted-by":"crossref","unstructured":"Shiratori T, Nakazawa A, Ikeuchi K. Synthesizing dance performance using musical and motion features. In: Proceedings 2006 IEEE International Conference on Robotics and Automation, 2006. ICRA 2006., pp. 3654\u20133659 (2006). IEEE","DOI":"10.1109\/ROBOT.2006.1642260"},{"key":"3910_CR39","doi-asserted-by":"publisher","first-page":"895","DOI":"10.1007\/s11042-012-1288-5","volume":"62","author":"M Lee","year":"2013","unstructured":"Lee M, Lee K, Park J. Music similarity-based approach to generating dance motion sequence. Multimedia tools and applications. 2013;62:895\u2013912.","journal-title":"Multimedia tools and applications"},{"issue":"3","key":"3910_CR40","doi-asserted-by":"publisher","first-page":"747","DOI":"10.1109\/TMM.2011.2181492","volume":"14","author":"F Ofli","year":"2011","unstructured":"Ofli F, Erzin E, Yemez Y, Tekalp AM. Learn2dance: Learning statistical music-to-dance mappings for choreography synthesis. IEEE Trans Multimedia. 2011;14(3):747\u201359.","journal-title":"IEEE Trans Multimedia"},{"key":"3910_CR41","unstructured":"Fukayama S, Goto M. Music content driven automated choreography with beat-wise motion connectivity constraints. Proceedings of SMC, 177\u2013183 (2015)"},{"key":"3910_CR42","unstructured":"Lee H.-Y, Yang X, Liu M.-Y, Wang T.-C, Lu Y.-D, Yang M.-H, Kautz J. Dancing to music. Advances in neural information processing systems 32 (2019)"},{"key":"3910_CR43","unstructured":"Zhuang W, Wang C, Xia S, Chai J, Wang Y. Music2dance: Music-driven dance generation using wavenet. arXiv:2002.03761 (2020)"},{"key":"3910_CR44","doi-asserted-by":"crossref","unstructured":"Yan S, Li Z, Xiong Y, Yan H, Lin D. Convolutional sequence generation for skeleton-based action synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4394\u20134402 (2019)","DOI":"10.1109\/ICCV.2019.00449"},{"key":"3910_CR45","unstructured":"Alemi O, Fran\u00e7oise J, Pasquier P. Groovenet: Real-time music-driven dance movement generation using artificial neural networks. networks 8(17), 26 (2017)"},{"key":"3910_CR46","unstructured":"Huang R, Hu H, Wu W, Sawada K, Zhang M, Jiang D. Dance revolution: Long-term dance generation with music via curriculum learning. arXiv preprint arXiv:2006.06119 (2020)"},{"key":"3910_CR47","unstructured":"Crnkovic-Friis L, Crnkovic-Friis L. Generative choreography using deep learning. arXiv preprint arXiv:1605.06921 (2016)"},{"issue":"4","key":"3910_CR48","first-page":"1","volume":"40","author":"K Chen","year":"2021","unstructured":"Chen K, Tan Z, Lei J, Zhang S-H, Guo Y-C, Zhang W, Hu S-M. Choreomaster: choreography-oriented music-driven dance synthesis. ACM Transactions on Graphics (TOG). 2021;40(4):1\u201313.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"3910_CR49","doi-asserted-by":"crossref","unstructured":"Aristidou A, Yiannakidis A, Aberman K, Cohen-Or D, Shamir A, Chrysanthou Y. Rhythm is a dancer: Music-driven motion synthesis with global structure. IEEE transactions on visualization and computer graphics (2022)","DOI":"10.1109\/TVCG.2022.3163676"},{"key":"3910_CR50","doi-asserted-by":"crossref","unstructured":"Papillon M, Pettee M, Miolane N. Pirounet: Creating dance through artist-centric deep learning. In: International Conference on ArtsIT, Interactivity and Game Creation, pp. 447\u2013465 (2022). Springer","DOI":"10.1007\/978-3-031-28993-4_31"},{"key":"3910_CR51","doi-asserted-by":"crossref","unstructured":"Cheng Y, Jiang Y, Wang Y. Music-stylized hierarchical dance synthesis with user control. In: Computer Graphics International (2024)","DOI":"10.1016\/j.vrih.2024.06.004"},{"key":"3910_CR52","doi-asserted-by":"crossref","unstructured":"Li R, Yang S, Ross D.A, Kanazawa A. Ai choreographer: Music conditioned 3d dance generation with aist++. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13401\u201313412 (2021)","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"3910_CR53","doi-asserted-by":"crossref","unstructured":"Siyao L, Yu W, Gu T, Lin C, Wang Q, Qian C, Loy C.C, Liu Z. Bailando: 3d dance generation by actor-critic gpt with choreographic memory. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11050\u201311059 (2022)","DOI":"10.1109\/CVPR52688.2022.01077"},{"key":"3910_CR54","doi-asserted-by":"crossref","unstructured":"Li B, Zhao Y, Zhelun S, Sheng L. Danceformer: Music conditioned 3d dance generation with parametric motion transformer. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 1272\u20131279 (2022)","DOI":"10.1609\/aaai.v36i2.20014"},{"key":"3910_CR55","unstructured":"Li J, Yin Y, Chu H, Zhou Y, Wang T, Fidler S, Li H. Learning to generate diverse dance motions with transformer. arXiv preprint arXiv:2008.08171 (2020)"},{"key":"3910_CR56","doi-asserted-by":"crossref","unstructured":"Cassell J, Vilhj\u00e1lmsson H.H, Bickmore T. Beat: the behavior expression animation toolkit. In: Proceedings of the 28th Annual Conference on Computer Graphics and Interactive Techniques, pp. 477\u2013486 (2001)","DOI":"10.1145\/383259.383315"},{"key":"3910_CR57","doi-asserted-by":"crossref","unstructured":"Pelachaud C, Bilvi M. Computational model of believable conversational agents. Communication in multiagent systems: Agent communication languages and conversation policies, 300\u2013317 (2003)","DOI":"10.1007\/978-3-540-44972-0_17"},{"key":"3910_CR58","doi-asserted-by":"crossref","unstructured":"Lee J, Marsella S. Nonverbal behavior generator for embodied conversational agents. In: International Workshop on Intelligent Virtual Agents, pp. 243\u2013255 (2006). Springer","DOI":"10.1007\/11821830_20"},{"key":"3910_CR59","unstructured":"Thiebaux M, Marsella S, Marshall A.N, Kallmann M. Smartbody: Behavior realization for embodied conversational agents. In: Proceedings of the 7th International Joint Conference on Autonomous Agents and Multiagent systems-Volume 1, pp. 151\u2013158 (2008)"},{"key":"3910_CR60","doi-asserted-by":"crossref","unstructured":"Marsella S, Xu Y, Lhommet M, Feng A, Scherer S, Shapiro A. Virtual character performance from speech. In: Proceedings of the 12th ACM SIGGRAPH\/Eurographics Symposium on Computer Animation, pp. 25\u201335 (2013)","DOI":"10.1145\/2485895.2485900"},{"key":"3910_CR61","unstructured":"Bergmann K, Kopp S. Increasing the expressiveness of virtual agents: autonomous generation of speech and gesture for spatial description tasks. In: AAMAS (1), pp. 361\u2013368 (2009)"},{"key":"3910_CR62","doi-asserted-by":"crossref","unstructured":"Bergmann K, Kopp S. Gnetic\u2013using bayesian decision networks for iconic gesture generation. In: Intelligent Virtual Agents: 9th International Conference, IVA 2009 Amsterdam, The Netherlands, September 14-16, 2009 Proceedings 9, pp. 76\u201389 (2009). Springer","DOI":"10.1007\/978-3-642-04380-2_12"},{"key":"3910_CR63","doi-asserted-by":"crossref","unstructured":"Levine S, Theobalt C, Koltun V. Real-time prosody-driven synthesis of body language. In: ACM SIGGRAPH Asia 2009 Papers, pp. 1\u201310 (2009)","DOI":"10.1145\/1661412.1618518"},{"key":"3910_CR64","doi-asserted-by":"crossref","unstructured":"Levine S, Kr\u00e4henb\u00fchl P, Thrun S, Koltun V. Gesture controllers. In: ACM SIGGRAPH 2010 Papers, pp. 1\u201311 (2010)","DOI":"10.1145\/1833349.1778861"},{"key":"3910_CR65","doi-asserted-by":"crossref","unstructured":"Chiu C.-C, Marsella S. How to train your avatar: A data driven approach to gesture generation. In: International Workshop on Intelligent Virtual Agents, pp. 127\u2013140 (2011). Springer","DOI":"10.1007\/978-3-642-23974-8_14"},{"key":"3910_CR66","doi-asserted-by":"crossref","unstructured":"Ghorbani S, Ferstl Y, Holden D, Troje N.F, Carbonneau M.-A. Zeroeggs: Zero-shot example-based gesture generation from speech. In: Computer Graphics Forum, vol. 42, pp. 206\u2013216 (2023). Wiley Online Library","DOI":"10.1111\/cgf.14734"},{"key":"3910_CR67","doi-asserted-by":"crossref","unstructured":"Liu H, Zhu Z, Iwamoto N, Peng Y, Li Z, Zhou Y, Bozkurt E, Zheng B. Beat: A large-scale semantic and emotional multi-modal dataset for conversational gestures synthesis. In: European Conference on Computer Vision, pp. 612\u2013630 (2022). Springer","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"3910_CR68","doi-asserted-by":"crossref","unstructured":"Liu H, Zhu Z, Becherini G, Peng Y, Su M, Zhou Y, Iwamoto N, Zheng B, Black M.J. Emage: Towards unified holistic co-speech gesture generation via masked audio gesture modeling. arXiv preprint arXiv:2401.00374 (2023)","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"3910_CR69","doi-asserted-by":"crossref","unstructured":"Habibie I, Xu W, Mehta D, Liu L, Seidel H.-P, Pons-Moll G, Elgharib M, Theobalt C. Learning speech-driven 3d conversational gestures from video. In: Proceedings of the 21st ACM International Conference on Intelligent Virtual Agents, pp. 101\u2013108 (2021)","DOI":"10.1145\/3472306.3478335"},{"key":"3910_CR70","doi-asserted-by":"crossref","unstructured":"Yoon Y, Ko W.-R, Jang M, Lee J, Kim J, Lee G. Robots learn social skills: End-to-end learning of co-speech gesture generation for humanoid robots. In: 2019 International Conference on Robotics and Automation (ICRA), pp. 4303\u20134309 (2019). IEEE","DOI":"10.1109\/ICRA.2019.8793720"},{"issue":"6","key":"3910_CR71","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417838","volume":"39","author":"Y Yoon","year":"2020","unstructured":"Yoon Y, Cha B, Lee J-H, Jang M, Lee J, Kim J, Lee G. Speech gesture generation from the trimodal context of text, audio, and speaker identity. ACM Transactions on Graphics (TOG). 2020;39(6):1\u201316.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"3910_CR72","doi-asserted-by":"crossref","unstructured":"Bhattacharya U, Childs E, Rewkowski N, Manocha D. Speech2affectivegestures: Synthesizing co-speech gestures with generative adversarial affective expression learning. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2027\u20132036 (2021)","DOI":"10.1145\/3474085.3475223"},{"key":"3910_CR73","doi-asserted-by":"crossref","unstructured":"Hasegawa D, Kaneko N, Shirakawa S, Sakuta H, Sumi K. Evaluation of speech-to-gesture generation using bi-directional lstm network. In: Proceedings of the 18th International Conference on Intelligent Virtual Agents, pp. 79\u201386 (2018)","DOI":"10.1145\/3267851.3267878"},{"key":"3910_CR74","doi-asserted-by":"crossref","unstructured":"Liu X, Wu Q, Zhou H, Xu Y, Qian R, Lin X, Zhou X, Wu W, Dai B, Zhou B. Learning hierarchical cross-modal association for co-speech gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10462\u201310472 (2022)","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"3910_CR75","doi-asserted-by":"crossref","unstructured":"Bhattacharya U, Rewkowski N, Banerjee A, Guhan P, Bera A, Manocha D. Text2gestures: A transformer-based network for generating emotive body gestures for virtual agents. In: 2021 IEEE Virtual Reality and 3D User Interfaces (VR), pp. 1\u201310 (2021). IEEE","DOI":"10.1109\/VR50410.2021.00037"},{"key":"3910_CR76","doi-asserted-by":"crossref","unstructured":"Alexanderson S, Henter G.E, Kucherenko T, Beskow J. Style-controllable speech-driven gesture synthesis using normalising flows. In: Computer Graphics Forum, vol. 39, pp. 487\u2013496 (2020). Wiley Online Library","DOI":"10.1111\/cgf.13946"},{"key":"3910_CR77","doi-asserted-by":"crossref","unstructured":"Habibie I, Elgharib M, Sarkar K, Abdullah A, Nyatsanga S, Neff M, Theobalt C. A motion matching-based framework for controllable gesture synthesis from speech. In: ACM SIGGRAPH 2022 Conference Proceedings, pp. 1\u20139 (2022)","DOI":"10.1145\/3528233.3530750"},{"key":"3910_CR78","doi-asserted-by":"crossref","unstructured":"Li J, Kang D, Pei W, Zhe X, Zhang Y, He Z, Bao L. Audio2gestures: Generating diverse gestures from speech audio with conditional variational autoencoders. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11293\u201311302 (2021)","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"3910_CR79","first-page":"21386","volume":"35","author":"X Liu","year":"2022","unstructured":"Liu X, Wu Q, Zhou H, Du Y, Wu W, Lin D, Liu Z. Audio-driven co-speech gesture video generation. Adv Neural Inf Process Syst. 2022;35:21386\u201399.","journal-title":"Adv Neural Inf Process Syst"},{"key":"3910_CR80","doi-asserted-by":"crossref","unstructured":"Yi H, Liang H, Liu Y, Cao Q, Wen Y, Bolkart T, Tao D, Black M.J. Generating holistic 3d human motion from speech. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 469\u2013480 (2023)","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"3910_CR81","doi-asserted-by":"crossref","unstructured":"Kucherenko T, Hasegawa D, Henter G.E, Kaneko N, Kjellstr\u00f6m H. Analyzing input and output representations for speech-driven gesture generation. In: Proceedings of the 19th ACM International Conference on Intelligent Virtual Agents, pp. 97\u2013104 (2019)","DOI":"10.1145\/3308532.3329472"},{"key":"3910_CR82","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1016\/j.cag.2020.04.007","volume":"89","author":"Y Ferstl","year":"2020","unstructured":"Ferstl Y, Neff M, McDonnell R. Adversarial gesture generation with realistic gesture phasing. Computers & Graphics. 2020;89:117\u201330.","journal-title":"Computers & Graphics"},{"key":"3910_CR83","doi-asserted-by":"crossref","unstructured":"Yang S, Wu Z, Li M, Zhang Z, Hao L, Bao W, Cheng M, Xiao L. Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919 (2023)","DOI":"10.24963\/ijcai.2023\/650"},{"issue":"4","key":"3910_CR84","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592097","volume":"42","author":"T Ao","year":"2023","unstructured":"Ao T, Zhang Z, Liu L. Gesturediffuclip: Gesture diffusion model with clip latents. ACM Transactions on Graphics (TOG). 2023;42(4):1\u201318.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"3910_CR85","doi-asserted-by":"crossref","unstructured":"Tevet G, Gordon B, Hertz A, Bermano A.H, Cohen-Or D. Motionclip: Exposing human motion generation to clip space. In: European Conference on Computer Vision, pp. 358\u2013374 (2022). Springer","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"3910_CR86","unstructured":"McNeill D. Gesture and thought. In: Gesture and Thought, (2008)"},{"key":"3910_CR87","unstructured":"CMU: Carnegie Mellon University Mocap Database. Carnegie Mellon University (2000)"},{"key":"3910_CR88","unstructured":"M\u00fcller M, R\u00f6der T, Clausen M, Eberhardt B, Kr\u00fcger B, Weber A. Documentation mocap database hdm05 (2007)"},{"key":"3910_CR89","doi-asserted-by":"crossref","unstructured":"Ofli F, Chaudhry R, Kurillo G, Vidal R, Bajcsy R. Berkeley mhad: A comprehensive multimodal human action database. In: 2013 IEEE Workshop on Applications of Computer Vision (WACV), pp. 53\u201360 (2013). IEEE","DOI":"10.1109\/WACV.2013.6474999"},{"key":"3910_CR90","doi-asserted-by":"crossref","unstructured":"Reynolds D.A, et al. Gaussian mixture models. Encyclopedia of biometrics 741(659-663) (2009)","DOI":"10.1007\/978-0-387-73003-5_196"},{"issue":"1","key":"3910_CR91","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1111\/j.2517-6161.1977.tb01600.x","volume":"39","author":"AP Dempster","year":"1977","unstructured":"Dempster AP, Laird NM, Rubin DB. Maximum likelihood from incomplete data via the em algorithm. J Roy Stat Soc: Ser B (Methodol). 1977;39(1):1\u201322.","journal-title":"J Roy Stat Soc: Ser B (Methodol)"},{"key":"3910_CR92","doi-asserted-by":"crossref","unstructured":"Syakur M.A, Khotimah B.K, Rochman E, Satoto B.D. Integration k-means clustering method and elbow method for identification of the best customer profile cluster. In: IOP Conference Series: Materials Science and Engineering, vol. 336, p. 012017 (2018). IOP Publishing","DOI":"10.1088\/1757-899X\/336\/1\/012017"},{"key":"3910_CR93","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A.N, Kaiser \u0141, Polosukhin I. Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"3910_CR94","unstructured":"Wu N, Green B, Ben X, O\u2019Banion S. Deep transformer models for time series forecasting: The influenza prevalence case. arXiv preprint arXiv:2001.08317 (2020)"},{"key":"3910_CR95","doi-asserted-by":"crossref","unstructured":"McFee B, Raffel C, Liang D, Ellis D.P, McVicar M, Battenberg E, Nieto O. librosa: Audio and music signal analysis in python. In: Proceedings of the 14th Python in Science Conference, vol. 8, pp. 18\u201325 (2015)","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"3910_CR96","doi-asserted-by":"crossref","unstructured":"Aristidou A, Charalambous P, Chrysanthou Y. Emotion analysis and classification: understanding the performers\u2019 emotions using the lma entities. In: Computer Graphics Forum, vol. 34, pp. 262\u2013276 (2015). Wiley Online Library","DOI":"10.1111\/cgf.12598"},{"key":"3910_CR97","unstructured":"Laban R, Ullmann L. The mastery of movement. (1971)"},{"key":"3910_CR98","unstructured":"Kingma D.P, Ba J. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"3910_CR99","doi-asserted-by":"crossref","unstructured":"Zhang R, Isola P, Efros A.A, Shechtman E, Wang O. The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"3910_CR100","doi-asserted-by":"publisher","first-page":"497","DOI":"10.1109\/TMM.2020.2981989","volume":"23","author":"G Sun","year":"2020","unstructured":"Sun G, Wong Y, Cheng Z, Kankanhalli MS, Geng W, Li X. Deepdance: music-to-dance motion choreography with adversarial learning. IEEE Trans Multimedia. 2020;23:497\u2013509.","journal-title":"IEEE Trans Multimedia"},{"key":"3910_CR101","doi-asserted-by":"crossref","unstructured":"Tulyakov S, Liu M.-Y, Yang X, Kautz J. Mocogan: Decomposing motion and content for video generation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1526\u20131535 (2018)","DOI":"10.1109\/CVPR.2018.00165"},{"key":"3910_CR102","doi-asserted-by":"crossref","unstructured":"Ferstl Y, Neff M, McDonnell R. Understanding the predictability of gesture parameters from speech and their perceptual importance. In: Proceedings of the 20th ACM International Conference on Intelligent Virtual Agents, pp. 1\u20138 (2020)","DOI":"10.1145\/3383652.3423882"},{"issue":"2","key":"3910_CR103","doi-asserted-by":"publisher","first-page":"190","DOI":"10.1109\/TAFFC.2015.2457417","volume":"7","author":"F Eyben","year":"2015","unstructured":"Eyben F, Scherer KR, Schuller BW, Sundberg J, Andr\u00e9 E, Busso C, Devillers LY, Epps J, Laukka P, Narayanan SS, et al. The geneva minimalistic acoustic parameter set (gemaps) for voice research and affective computing. IEEE Trans Affect Comput. 2015;7(2):190\u2013202.","journal-title":"IEEE Trans Affect Comput"},{"key":"3910_CR104","doi-asserted-by":"crossref","unstructured":"Eyben F, Weninger F, Gross F, Schuller B. Recent developments in opensmile, the munich open-source multimedia feature extractor. In: Proceedings of the 21st ACM International Conference on Multimedia, pp. 835\u2013838 (2013)","DOI":"10.1145\/2502081.2502224"},{"key":"3910_CR105","unstructured":"Ltd, A. My Voiceovers. Accessed: 2024-07-16. https:\/\/artlist.io\/voice-over\/my-voiceovers"},{"key":"3910_CR106","unstructured":"EducationalTestingService: Toefl Sample Test. https:\/\/ets.org"},{"key":"3910_CR107","doi-asserted-by":"crossref","unstructured":"Liu H, Iwamoto N, Zhu Z, Li Z, Zhou Y, Bozkurt E, Zheng B. Disco: Disentangled implicit content and rhythm learning for diverse co-speech gestures synthesis. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 3764\u20133773 (2022)","DOI":"10.1145\/3503161.3548400"},{"key":"3910_CR108","doi-asserted-by":"crossref","unstructured":"Kucherenko T, Jonell P, Van\u00a0Waveren S, Henter G.E, Alexandersson S, Leite I, Kjellstr\u00f6m H. Gesticulator: A framework for semantically-aware speech-driven gesture generation. In: Proceedings of the 2020 International Conference on Multimodal Interaction, pp. 242\u2013250 (2020)","DOI":"10.1145\/3382507.3418815"},{"key":"3910_CR109","doi-asserted-by":"crossref","unstructured":"Ye S, Wen Y.-H, Sun Y, He Y, Zhang Z, Wang Y, He W, Liu Y.-J. Audio-driven stylized gesture generation with flow-based model. In: European Conference on Computer Vision, pp. 712\u2013728 (2022). Springer","DOI":"10.1007\/978-3-031-20065-6_41"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-03910-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-025-03910-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-03910-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T08:03:55Z","timestamp":1745309035000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-025-03910-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":109,"journal-issue":{"issue":"5","published-online":{"date-parts":[[2025,6]]}},"alternative-id":["3910"],"URL":"https:\/\/doi.org\/10.1007\/s42979-025-03910-9","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"17 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"On behalf of all authors, the corresponding author states that there is no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"409"}}