{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T14:50:33Z","timestamp":1774968633157,"version":"3.50.1"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"22","license":[{"start":{"date-parts":[[2024,8,26]],"date-time":"2024-08-26T00:00:00Z","timestamp":1724630400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,26]],"date-time":"2024-08-26T00:00:00Z","timestamp":1724630400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Basic and Applied Basic Research Program of Guangdong Province","award":["2020A1515110523"],"award-info":[{"award-number":["2020A1515110523"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["QTZX22079"],"award-info":[{"award-number":["QTZX22079"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100019026","name":"Guangxi Key Laboratory of Metabolic Disease Research","doi-asserted-by":"publisher","award":["KX202045"],"award-info":[{"award-number":["KX202045"]}],"id":[{"id":"10.13039\/100019026","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2024,11]]},"DOI":"10.1007\/s10489-024-05769-4","type":"journal-article","created":{"date-parts":[[2024,8,26]],"date-time":"2024-08-26T07:02:16Z","timestamp":1724655736000},"page":"11525-11535","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Cospeech body motion generation using a transformer"],"prefix":"10.1007","volume":"54","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2743-2017","authenticated-orcid":false,"given":"Zixiang","family":"Lu","sequence":"first","affiliation":[]},{"given":"Zhitong","family":"He","sequence":"additional","affiliation":[]},{"given":"Jiale","family":"Hong","sequence":"additional","affiliation":[]},{"given":"Ping","family":"Gao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,26]]},"reference":[{"key":"5769_CR1","doi-asserted-by":"crossref","unstructured":"Kopp S, Krenn B, Marsella S, Marshall AN, Pelachaud C, Pirker H, Th\u00f3risson KR, Vilhj\u00e1lmsson H (2006) Towards a common framework for multimodal generation: The behavior markup language. In: International workshop on intelligent virtual agents, pp 205\u2013217. Springer","DOI":"10.1007\/11821830_17"},{"key":"5769_CR2","doi-asserted-by":"crossref","unstructured":"Wagner P, Malisz Z, Kopp S (2014) Gesture and speech in interaction: An overview. Elsevier","DOI":"10.1016\/j.specom.2013.09.008"},{"key":"5769_CR3","doi-asserted-by":"crossref","unstructured":"Levine S, Kr\u00e4henb\u00fchl P, Thrun S, Koltun V (2010) Gesture controllers. In: ACM SIGGRAPH 2010 Papers, pp 1\u201311","DOI":"10.1145\/1833349.1778861"},{"key":"5769_CR4","doi-asserted-by":"crossref","unstructured":"Kucherenko T, Hasegawa D, Henter G.E, Kaneko N, Kjellstr\u00f6m H (2019) Analyzing input and output representations for speech-driven gesture generation. In: Proceedings of the 19th ACM international conference on intelligent virtual agents, pp 97\u2013104","DOI":"10.1145\/3308532.3329472"},{"key":"5769_CR5","doi-asserted-by":"crossref","unstructured":"Ferstl Y, Neff M, McDonnell R (2019) Multi-objective adversarial gesture generation. In: Motion, interaction and games, pp 1\u201310","DOI":"10.1145\/3359566.3360053"},{"key":"5769_CR6","doi-asserted-by":"crossref","unstructured":"Ginosar S, Bar A, Kohavi G, Chan C, Owens A, Malik J (2019) Learning individual styles of conversational gesture. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3497\u20133506","DOI":"10.1109\/CVPR.2019.00361"},{"key":"5769_CR7","doi-asserted-by":"crossref","unstructured":"Li X, Yin X, Li C, Zhang P, Hu X, Zhang L, Wang L, Hu H, Dong L, Wei F,et al (2020) Oscar: Object-semantics aligned pre-training for vision-language tasks. In: European conference on computer vision, pp 121\u2013137. Springer","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"5769_CR8","doi-asserted-by":"crossref","unstructured":"Qian S, Tu Z, Zhi Y, Liu W, Gao S (2021) Speech drives templates: Co-speech gesture synthesis with learned templates. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 11077\u201311086","DOI":"10.1109\/ICCV48922.2021.01089"},{"key":"5769_CR9","doi-asserted-by":"crossref","unstructured":"Li R, Yang S, Ross D.A, Kanazawa A (2021) Ai choreographer: Music conditioned 3d dance generation with aist++. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 13401\u201313412","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"5769_CR10","unstructured":"Kingma DP, Welling M (2013) Auto-encoding variational bayes. arXiv:1312.6114"},{"key":"5769_CR11","doi-asserted-by":"crossref","unstructured":"Cassell J, Vilhj\u00e1lmsson HH, Bickmore T (2001) Beat: the behavior expression animation toolkit. In: Proceedings of the 28th annual conference on computer graphics and interactive techniques, pp 477\u2013486","DOI":"10.1145\/383259.383315"},{"issue":"4","key":"5769_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn S, Seitz SM, Kemelmacher-Shlizerman I (2017) Synthesizing obama: learning lip sync from audio. ACM Trans Graph (ToG) 36(4):1\u201313","journal-title":"ACM Trans Graph (ToG)"},{"key":"5769_CR13","doi-asserted-by":"crossref","unstructured":"Prajwal K, Mukhopadhyay R, Namboodiri VP, Jawahar C (2020) A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM international conference on multimedia, pp 484\u2013492","DOI":"10.1145\/3394171.3413532"},{"key":"5769_CR14","doi-asserted-by":"crossref","unstructured":"Chen L, Cui G, Liu C, Li Z, Kou Z, Xu Y, Xu C (2020) Talking-head generation with rhythmic head motion. In: European conference on computer vision, pp 35\u201351. Springer","DOI":"10.1007\/978-3-030-58545-7_3"},{"issue":"4","key":"5769_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3306346.3323028","volume":"38","author":"O Fried","year":"2019","unstructured":"Fried O, Tewari A, Zollh\u00f6fer M, Finkelstein A, Shechtman E, Goldman DB, Genova K, Jin Z, Theobalt C, Agrawala M (2019) Text-based editing of talking-head video. ACM Trans Graph (TOG) 38(4):1\u201314","journal-title":"ACM Trans Graph (TOG)"},{"key":"5769_CR16","unstructured":"Yi R, Ye Z, Zhang J, Bao H, Liu Y-J (2020) Audio-driven talking face video generation with learning-based personalized head pose. arXiv:2002.10137"},{"key":"5769_CR17","doi-asserted-by":"crossref","unstructured":"Sadoughi N, Busso C (2018) Novel realizations of speech-driven head movements with generative adversarial networks. In: 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 6169\u20136173. IEEE","DOI":"10.1109\/ICASSP.2018.8461967"},{"key":"5769_CR18","doi-asserted-by":"crossref","unstructured":"Bergmann K, Kopp S (2009) Gnetic\u2013using bayesian decision networks for iconic gesture generation. In: International workshop on intelligent virtual agents, pp 76\u201389. Springer","DOI":"10.1007\/978-3-642-04380-2_12"},{"key":"5769_CR19","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1016\/j.specom.2019.04.005","volume":"110","author":"N Sadoughi","year":"2019","unstructured":"Sadoughi N, Busso C (2019) Speech-driven animation with meaningful behaviors. Speech Commun 110:90\u2013100","journal-title":"Speech Commun"},{"key":"5769_CR20","doi-asserted-by":"crossref","unstructured":"Yoon Y, Ko W-R, Jang M, Lee J, Kim J, Lee G (2019) Robots learn social skills: End-to-end learning of co-speech gesture generation for humanoid robots. In: 2019 International conference on robotics and automation (ICRA), pp 4303\u20134309. IEEE","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"5769_CR21","doi-asserted-by":"crossref","unstructured":"Shlizerman E, Dery L, Schoen H, Kemelmacher-Shlizerman I (2018) Audio to body dynamics. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 7574\u20137583","DOI":"10.1109\/CVPR.2018.00790"},{"key":"5769_CR22","doi-asserted-by":"crossref","unstructured":"Fan B, Wang L, Soong FK, Xie L (2015) Photo-real talking head with deep bidirectional lstm. In: 2015 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 4884\u20134888","DOI":"10.1109\/ICASSP.2015.7178899"},{"key":"5769_CR23","first-page":"1","volume":"72","author":"C Han","year":"2023","unstructured":"Han C, Sun J, Bian Y, Que W, Shi L (2023) Automated detection and localization of myocardial infarction with interpretability analysis based on deep learning. IEEE Trans Instrum Meas 72:1\u201312","journal-title":"IEEE Trans Instrum Meas"},{"issue":"3","key":"5769_CR24","doi-asserted-by":"publisher","first-page":"374","DOI":"10.1016\/j.jrp.2010.04.002","volume":"44","author":"M Koppensteiner","year":"2010","unstructured":"Koppensteiner M, Grammer K (2010) Motion patterns in political speech and their influence on personality ratings. J Res Pers 44(3):374\u2013379","journal-title":"J Res Pers"},{"issue":"4","key":"5769_CR25","first-page":"1","volume":"36","author":"HJ Smith","year":"2017","unstructured":"Smith HJ, Neff M (2017) Understanding the impact of animated gesture performance on personality perceptions. ACM Trans Graph (TOG) 36(4):1\u201312","journal-title":"ACM Trans Graph (TOG)"},{"key":"5769_CR26","unstructured":"Castillo G, Neff M (2019) What do we express without knowing? emotion in gesture. In: Proceedings of the 18th international conference on autonomous agents and multiagent systems, pp 702\u2013710"},{"key":"5769_CR27","doi-asserted-by":"crossref","unstructured":"Hsu E, Pulli K, Popovi\u0107 J (2005) Style translation for human motion. In: ACM SIGGRAPH 2005 Papers, pp 1082\u20131089","DOI":"10.1145\/1186822.1073315"},{"issue":"4","key":"5769_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2766999","volume":"34","author":"S Xia","year":"2015","unstructured":"Xia S, Wang C, Chai J, Hodgins J (2015) Realtime style transfer for unlabeled heterogeneous human motion. ACM Trans Graph (TOG) 34(4):1\u201310","journal-title":"ACM Trans Graph (TOG)"},{"issue":"2","key":"5769_CR29","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3340254","volume":"2","author":"HJ Smith","year":"2019","unstructured":"Smith HJ, Cao C, Neff M, Wang Y (2019) Efficient neural networks for real-time motion style transfer. Proc ACM Comput Graph Interac Tech 2(2):1\u201317","journal-title":"Proc ACM Comput Graph Interac Tech"},{"issue":"11","key":"5769_CR30","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow I, Pouget-Abadie J, Mirza M, Xu B, Warde-Farley D, Ozair S, Courville A, Bengio Y (2020) Generative adversarial networks. Commun ACM 63(11):139\u2013144","journal-title":"Commun ACM"},{"key":"5769_CR31","unstructured":"Brock A, Donahue J, Simonyan K (2018) Large scale gan training for high fidelity natural image synthesis. arXiv:1809.11096"},{"key":"5769_CR32","unstructured":"Pham HX, Wang Y, Pavlovic V (2018) Generative adversarial talking head: Bringing portraits to life with a weakly supervised neural network. arXiv:1803.07716"},{"key":"5769_CR33","doi-asserted-by":"crossref","unstructured":"Pumarola A, Agudo A, Martinez AM, Sanfeliu A, Moreno-Noguer F (2018) Ganimation: Anatomically-aware facial animation from a single image. In: Proceedings of the european conference on computer vision (ECCV), pp 818\u2013833","DOI":"10.1007\/978-3-030-01249-6_50"},{"issue":"5","key":"5769_CR34","doi-asserted-by":"publisher","first-page":"1398","DOI":"10.1007\/s11263-019-01251-8","volume":"128","author":"K Vougioukas","year":"2020","unstructured":"Vougioukas K, Petridis S, Pantic M (2020) Realistic speech-driven facial animation with gans. Int J Comput Vis 128(5):1398\u20131413","journal-title":"Int J Comput Vis"},{"key":"5769_CR35","unstructured":"Lucic M, Kurach K, Michalski M, Gelly S, Bousquet O (2018) Are gans created equal? a large-scale study. Advances in Neural Information Processing Systems 31"},{"key":"5769_CR36","doi-asserted-by":"crossref","unstructured":"Alexanderson S, Henter GE, Kucherenko T, Beskow J (2020) Style-controllable speech-driven gesture synthesis using normalising flows. In: Computer graphics forum, vol 39, pp 487\u2013496. Wiley Online Library","DOI":"10.1111\/cgf.13946"},{"key":"5769_CR37","doi-asserted-by":"crossref","unstructured":"Ahuja C, Lee DW, Nakano YI, Morency L-P (2020) Style transfer for co-speech gesture animation: A multi-speaker conditional-mixture approach. In: European conference on computer vision, pp 248\u2013265. Springer","DOI":"10.1007\/978-3-030-58523-5_15"},{"issue":"6","key":"5769_CR38","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417838","volume":"39","author":"Y Yoon","year":"2020","unstructured":"Yoon Y, Cha B, Lee J-H, Jang M, Lee J, Kim J, Lee G (2020) Speech gesture generation from the trimodal context of text, audio, and speaker identity. ACM Trans Graph (TOG) 39(6):1\u201316","journal-title":"ACM Trans Graph (TOG)"},{"issue":"4","key":"5769_CR39","doi-asserted-by":"publisher","first-page":"543","DOI":"10.1016\/j.specom.2011.11.004","volume":"54","author":"M Sahidullah","year":"2012","unstructured":"Sahidullah M, Saha G (2012) Design, analysis and experimental evaluation of block based transformation in mfcc computation for speaker recognition. Speech Commun 54(4):543\u2013565","journal-title":"Speech Commun"},{"key":"5769_CR40","doi-asserted-by":"crossref","unstructured":"Fang H-S, Xie S, Tai Y-W, Lu C (2017) Rmpe: Regional multi-person pose estimation. In: Proceedings of the IEEE international conference on computer vision, pp 2334\u20132343","DOI":"10.1109\/ICCV.2017.256"},{"key":"5769_CR41","unstructured":"Xiu Y, Li J, Wang H, Fang Y, Lu C (2018) Pose flow: Efficient online pose tracking. arXiv:1802.00977"},{"key":"5769_CR42","unstructured":"Xu J, Zhang W, Bai Y, Sun Q, Mei T (2022) Freeform body motion generation from speech. arXiv:2203.02291"},{"key":"5769_CR43","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A.N, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Advances In Neural Information Processing Systems 30"},{"issue":"6","key":"5769_CR44","first-page":"1","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou Y, Han X, Shechtman E, Echevarria J, Kalogerakis E, Li D (2020) Makelttalk: speaker-aware talking-head animation. ACM Trans Graph (TOG) 39(6):1\u201315","journal-title":"ACM Trans Graph (TOG)"},{"key":"5769_CR45","unstructured":"Heusel M, Ramsauer H, Unterthiner T, Nessler B, Hochreiter S (2017) Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in Neural Information Processing Systems 30"},{"key":"5769_CR46","doi-asserted-by":"crossref","unstructured":"Zhu L, Liu X, Liu X, Qian R, Liu Z, Yu L (2023) Taming diffusion models for audio-driven co-speech gesture generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10544\u201310553","DOI":"10.1109\/CVPR52729.2023.01016"},{"key":"5769_CR47","doi-asserted-by":"crossref","unstructured":"Liu X, Wu Q, Zhou H, Xu Y, Qian R, Lin X, Zhou X, Wu W, Dai B, Zhou B (2022) Learning hierarchical cross-modal association for co-speech gesture generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10462\u201310472","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"5769_CR48","doi-asserted-by":"crossref","unstructured":"Yoon Y, Ko W-R, Jang M, Lee J, Kim J, Lee G (2019) Robots learn social skills: End-to-end learning of co-speech gesture generation for humanoid robots. In: Proc. of the international conference in robotics and automation (ICRA)","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"5769_CR49","doi-asserted-by":"crossref","unstructured":"Ahuja C, Morency L-P (2019) Language2pose: Natural language grounded pose forecasting. In: Proceedings of 2019 international conference on 3D vision (3DV), pp 719\u2013728","DOI":"10.1109\/3DV.2019.00084"},{"key":"5769_CR50","unstructured":"Wang T-C, Liu M-Y, Zhu J-Y, Liu G, Tao A, Kautz J, Catanzaro B (2018) Video-to-video synthesis. In: Conference on neural information processing systems (NeurIPS)"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-05769-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-024-05769-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-05769-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,18]],"date-time":"2024-09-18T15:26:20Z","timestamp":1726673180000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-024-05769-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,26]]},"references-count":50,"journal-issue":{"issue":"22","published-print":{"date-parts":[[2024,11]]}},"alternative-id":["5769"],"URL":"https:\/\/doi.org\/10.1007\/s10489-024-05769-4","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,26]]},"assertion":[{"value":"11 August 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 August 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors certify that there is no conflict of interest with any individual or organization for this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Authors have cited any publicly available data on which the conclusions of the paper rely.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and informed consent for data used"}}]}}