{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T15:07:30Z","timestamp":1761664050966,"version":"build-2065373602"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62306029"],"award-info":[{"award-number":["62306029"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CCF-Tencent Rhino-Bird Open Research Fund"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J of Soc Robotics"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s12369-024-01136-y","type":"journal-article","created":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T13:01:52Z","timestamp":1715605312000},"page":"2065-2076","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Dual-Path Transformer-Based GAN for Co-speech Gesture Synthesis"],"prefix":"10.1007","volume":"17","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9511-6713","authenticated-orcid":false,"given":"Xinyuan","family":"Qian","sequence":"first","affiliation":[]},{"given":"Hao","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Jichen","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Hongxu","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Xu-Cheng","family":"Yin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,5,13]]},"reference":[{"key":"1136_CR1","doi-asserted-by":"publisher","first-page":"1222","DOI":"10.1016\/j.chb.2015.04.005","volume":"55","author":"J Li","year":"2016","unstructured":"Li J, Kizilcec R, Bailenson J, Ju W (2016) Social robots and virtual agents as lecturers for video instruction. Comput Hum Behav 55:1222\u20131230","journal-title":"Comput Hum Behav"},{"key":"1136_CR2","doi-asserted-by":"crossref","unstructured":"Liao M-Y, Sung C-Y, Wang H-C, Lin W-C (2019) Virtual classmates: embodying historical learners\u2019 messages as learning companions in a VR classroom through comment mapping. In: IEEE conference on virtual reality and 3D user interfaces. IEEE, pp 163\u2013171","DOI":"10.1109\/VR.2019.8797708"},{"key":"1136_CR3","doi-asserted-by":"crossref","unstructured":"Baur T, Damian I, Gebhard P, Porayska-Pomsta K, Andr\u00e9 E (2013) A job interview simulation: social cue-based interaction with a virtual character. In: International conference on social computing. IEEE, pp 220\u2013227","DOI":"10.1109\/SocialCom.2013.39"},{"key":"1136_CR4","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1016\/j.specom.2019.04.005","volume":"110","author":"N Sadoughi","year":"2019","unstructured":"Sadoughi N, Busso C (2019) Speech-driven animation with meaningful behaviors. 
Speech Commun 110:90\u2013100","journal-title":"Speech Commun"},{"issue":"4","key":"1136_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras T, Aila T, Laine S, Herva A, Lehtinen J (2017) Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans Gr 36(4):1\u201312","journal-title":"ACM Trans Gr"},{"key":"1136_CR6","doi-asserted-by":"crossref","unstructured":"Ahuja C, Morency L-P (2019) Language2pose: natural language grounded pose forecasting. In: International conference on 3D vision. IEEE, pp 719\u2013728","DOI":"10.1109\/3DV.2019.00084"},{"key":"1136_CR7","first-page":"2","volume":"2018","author":"AS Lin","year":"2018","unstructured":"Lin AS, Wu L, Corona R, Tai K, Huang Q, Mooney RJ (2018) Generating animated videos of human activities from natural language descriptions. Learning 2018:2","journal-title":"Learning"},{"issue":"6","key":"1136_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417838","volume":"39","author":"Y Yoon","year":"2020","unstructured":"Yoon Y, Cha B, Lee J-H, Jang M, Lee J, Kim J, Lee G (2020) Speech gesture generation from the trimodal context of text, audio, and speaker identity. ACM Trans Gr 39(6):1\u201316","journal-title":"ACM Trans Gr"},{"key":"1136_CR9","unstructured":"Robotics S, NAO https:\/\/www.aldebaran.com\/en\/nao"},{"issue":"2","key":"1136_CR10","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1037\/xge0000646","volume":"149","author":"W Pouw","year":"2020","unstructured":"Pouw W, Harrison SJ, Dixon JA (2020) Gesture-speech physics: the biomechanical basis for the emergence of gesture-speech synchrony. J Exp Psychol Gen 149(2):391","journal-title":"J Exp Psychol Gen"},{"issue":"1","key":"1136_CR11","doi-asserted-by":"publisher","first-page":"266","DOI":"10.1111\/lang.12376","volume":"70","author":"M Graziano","year":"2020","unstructured":"Graziano M, Nicoladis E, Marentette P (2020) How referential gestures align with speech: evidence from monolingual and bilingual speakers. Lang Learn 70(1):266\u2013304","journal-title":"Lang Learn"},{"issue":"1","key":"1136_CR12","doi-asserted-by":"crossref","first-page":"71","DOI":"10.1515\/lp-2012-0006","volume":"3","author":"DP Loehr","year":"2012","unstructured":"Loehr DP (2012) Temporal, structural, and pragmatic synchrony between intonation and gesture. Lab Phonol 3(1):71\u201389","journal-title":"Lab Phonol"},{"key":"1136_CR13","doi-asserted-by":"crossref","unstructured":"Chiu C-C, Morency L-P, Marsella S (2015) Predicting co-verbal gestures: a deep and temporal modeling approach. In: International conference on intelligent virtual agents. Springer, pp 152\u2013166","DOI":"10.1007\/978-3-319-21996-7_17"},{"key":"1136_CR14","volume-title":"Gesture and speech in interaction: an overview","author":"P Wagner","year":"2014","unstructured":"Wagner P, Malisz Z, Kopp S (2014) Gesture and speech in interaction: an overview. Elsevier"},{"key":"1136_CR15","doi-asserted-by":"crossref","unstructured":"Kucherenko T, Hasegawa D, Henter GE, Kaneko N, Kjellstr\u00f6m H (2019) Analyzing input and output representations for speech-driven gesture generation. In: Proceedings of ACM international conference on intelligent virtual agents, pp 97\u2013104","DOI":"10.1145\/3308532.3329472"},{"key":"1136_CR16","unstructured":"Robotics S, Pepper and NAO robots education"},{"key":"1136_CR17","doi-asserted-by":"crossref","unstructured":"17. 
Yoon Y, Ko W-R, Jang M, Lee J, Kim J, Lee G (2019) Robots learn social skills: end-to-end learning of co-speech gesture generation for humanoid robots. In: International conference on robotics and automation. IEEE, pp 4303\u20134309","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"1136_CR18","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1016\/j.cag.2020.04.007","volume":"89","author":"Y Ferstl","year":"2020","unstructured":"Ferstl Y, Neff M, McDonnell R (2020) Adversarial gesture generation with realistic gesture phasing. Comput Gr 89:117\u2013130","journal-title":"Comput Gr"},{"key":"1136_CR19","doi-asserted-by":"crossref","unstructured":"Ginosar S, Bar A, Kohavi G, Chan C, Owens A, Malik J (2019) Learning individual styles of conversational gesture. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3497\u20133506","DOI":"10.1109\/CVPR.2019.00361"},{"key":"1136_CR20","doi-asserted-by":"crossref","unstructured":"Hasegawa D, Kaneko N, Shirakawa S, Sakuta H, Sumi K (2018) Evaluation of speech-to-gesture generation using bi-directional LSTM network. In: Proceedings of the international conference on intelligent virtual agents, pp 79\u201386","DOI":"10.1145\/3267851.3267878"},{"issue":"4","key":"1136_CR21","doi-asserted-by":"publisher","first-page":"3757","DOI":"10.1109\/LRA.2018.2856281","volume":"3","author":"CT Ishi","year":"2018","unstructured":"Ishi CT, Machiyashiki D, Mikata R, Ishiguro H (2018) A speech-driven hand gesture generation method and evaluation in android robots. IEEE Robot Autom Lett 3(4):3757\u20133764","journal-title":"IEEE Robot Autom Lett"},{"key":"1136_CR22","doi-asserted-by":"crossref","unstructured":"Monahan S, Johnson E, Lucas G, Finch J, Gratch J (2018) Autonomous agent that provides automated feedback improves negotiation skills. In: International conference on artificial intelligence in education. Springer, pp 225\u2013229","DOI":"10.1007\/978-3-319-93846-2_41"},{"issue":"1","key":"1136_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1330511.1330516","volume":"27","author":"M Neff","year":"2008","unstructured":"Neff M, Kipp M, Albrecht I, Seidel H-P (2008) Gesture modeling and animation based on a probabilistic re-creation of speaker style. ACM Trans Gr 27(1):1\u201324","journal-title":"ACM Trans Gr"},{"key":"1136_CR24","doi-asserted-by":"crossref","unstructured":"Yang S, Wu Z, Li M, Zhang Z, Hao L, Bao W, Cheng M, Xiao L (2023) Diffusestylegesture: stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919","DOI":"10.24963\/ijcai.2023\/650"},{"key":"1136_CR25","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. arXiv preprint arXiv:1706.03762"},{"key":"1136_CR26","doi-asserted-by":"crossref","unstructured":"Qiu M, Rong Q, Liang D, Tu H (2023) Visual Scanpath transformer: guiding computers to see the world. In: IEEE International symposium on mixed and augmented reality, pp 223\u2013232","DOI":"10.1109\/ISMAR59233.2023.00037"},{"key":"1136_CR27","doi-asserted-by":"crossref","unstructured":"Tsai Y-HH, Bai S, Liang PP, Kolter JZ, Morency L-P, Salakhutdinov R (2019) Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the conference. association for computational linguistics, vol 2019. 
NIH Public Access, p 6558","DOI":"10.18653\/v1\/P19-1656"},{"key":"1136_CR28","doi-asserted-by":"crossref","unstructured":"Bhattacharya U, Rewkowski N, Banerjee A, Guhan P, Bera A, Manocha D (2021) Text2gestures: a transformer-based network for generating emotive body gestures for virtual agents. arXiv preprint arXiv:2101.11101","DOI":"10.1109\/VR50410.2021.00037"},{"key":"1136_CR29","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J et al (2021) Learning transferable visual models from natural language supervision. In: Proceedings of the international conference on machine learning, pp 8748\u20138763"},{"key":"1136_CR30","doi-asserted-by":"crossref","unstructured":"Wu Y, Chen K, Zhang T, Hui Y, Berg-Kirkpatrick T, Dubnov S (2023) Large-scale contrastive language-audio pretraining with feature fusion and keyword-to-caption augmentation. In: Proceedings of the international conference on audio, speech, signal process, pp 1\u20135","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"1136_CR31","unstructured":"Goodfellow IJ, Pouget-Abadie J, Mirza M, Xu B, Warde-Farley D, Ozair S, Courville A, Bengio Y (2014) Generative adversarial networks. arXiv preprint arXiv:1406.2661"},{"key":"1136_CR32","doi-asserted-by":"crossref","unstructured":"Chu J, An D, Ma Y, Cui W, Zhai S, Gu XD, Bi X (2023) Wordgesture-GAN: modeling word-gesture movement with generative adversarial network. In: Proceedings of the 2023 CHI conference on human factors in computing systems, pp 1\u201315","DOI":"10.1145\/3544548.3581279"},{"key":"1136_CR33","doi-asserted-by":"crossref","unstructured":"Liu CY, Mohammadi G, Song Y, Johal W (2023) Speech-gesture GAN: gesture generation for robots and embodied agents. In: IEEE international conference on robot and human interactive communication, pp 405\u2013412","DOI":"10.1109\/RO-MAN57019.2023.10309493"},{"key":"1136_CR34","doi-asserted-by":"crossref","unstructured":"Liu X, Wu Q, Zhou H, Xu Y, Qian R, Lin X, Zhou X, Wu W, Dai B, Zhou B (2022) Learning hierarchical cross-modal association for co-speech gesture generation. In: Proceedings of the international conference on computer vision and pattern recognition, pp 10462\u201310472 (2022)","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"1136_CR35","doi-asserted-by":"crossref","unstructured":"Vo\u00df H, Kopp S (2023) AQ-GT: a temporally aligned and quantized GRU-transformer for co-speech gesture synthesis. arXiv preprint arXiv:2305.01241","DOI":"10.1145\/3577190.3614135"},{"key":"1136_CR36","doi-asserted-by":"crossref","unstructured":"Liang Y, Feng Q, Zhu L, Hu L, Pan P, Yang Y (2022) SEEG: semantic energized co-speech gesture generation. In: Proceedings of the international conference on computer vision and pattern recognition, pp 10473\u201310482","DOI":"10.1109\/CVPR52688.2022.01022"},{"key":"1136_CR37","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"P Bojanowski","year":"2017","unstructured":"Bojanowski P, Grave E, Joulin A, Mikolov T (2017) Enriching word vectors with subword information. Trans Assoc Comput Linguist 5:135\u2013146","journal-title":"Trans Assoc Comput Linguist"},{"key":"1136_CR38","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: global vectors for word representation. 
In: Proceedings of the conference on empirical methods in natural language processing (EMNLP), pp 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"key":"1136_CR39","unstructured":"Bai S, Kolter JZ, Koltun V (2018) An empirical evaluation of generic convolutional and recurrent networks for sequence modeling. arXiv preprint arXiv:1803.01271"},{"key":"1136_CR40","doi-asserted-by":"crossref","unstructured":"Cho K, Van\u00a0Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078","DOI":"10.3115\/v1\/D14-1179"},{"key":"1136_CR41","doi-asserted-by":"crossref","unstructured":"Sadoughi N, Liu Y, Busso C (2015) MSP-AVATAR corpus: motion capture recordings to study the role of discourse functions in the design of intelligent virtual agents. In: International conference and workshops on automatic face gesture recognition, pp 1\u20135","DOI":"10.1109\/FG.2015.7284885"},{"key":"1136_CR42","unstructured":"Tolins J, Liu K, Wang Y, Tree JEF, Walker M, Neff M (2016) A multimodal motion-captured corpus of matched and mismatched extravert\u2013introvert conversational pairs. In: Proceedings of the international conference on language resources and evaluation, pp 3469\u20133476"},{"key":"1136_CR43","unstructured":"Kingma DP, Ba J (2014) Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"issue":"7","key":"1136_CR44","doi-asserted-by":"publisher","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","volume":"36","author":"C Ionescu","year":"2013","unstructured":"Ionescu C, Papava D, Olaru V, Sminchisescu C (2013) Human3.6m: large scale datasets and predictive methods for 3d human sensing in natural environments. IEEE Trans Pattern Anal Mach Intell 36(7):1325\u20131339","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1136_CR45","doi-asserted-by":"crossref","unstructured":"Shen Y, Feng Y, Wang W, Liang D, Qin J, Xie H, Wei M (2022) MBA-RainGAN: a multi-branch attention generative adversarial network for mixture of rain removal. 
In: Proceedings of the international conference on audio, speech, signal processing, pp 3418\u20133422","DOI":"10.1109\/ICASSP43922.2022.9746588"}],"container-title":["International Journal of Social Robotics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12369-024-01136-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s12369-024-01136-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12369-024-01136-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T14:54:04Z","timestamp":1761663244000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s12369-024-01136-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":45,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["1136"],"URL":"https:\/\/doi.org\/10.1007\/s12369-024-01136-y","relation":{},"ISSN":["1875-4791","1875-4805"],"issn-type":[{"type":"print","value":"1875-4791"},{"type":"electronic","value":"1875-4805"}],"subject":[],"published":{"date-parts":[[2024,5,13]]},"assertion":[{"value":"19 March 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 May 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}
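The JSON above is a Crossref work record for the article "Dual-Path Transformer-Based GAN for Co-speech Gesture Synthesis" (DOI 10.1007/s12369-024-01136-y). As a minimal sketch, assuming the record was retrieved from the public Crossref REST API at api.crossref.org (the "status"/"message-type"/"message" envelope above matches that API's response shape), the snippet below fetches the same record and pulls out the fields present in it; field names are taken directly from the record itself.

# Minimal sketch: fetch and parse this Crossref work record.
# Assumption: the record above came from https://api.crossref.org/works/<DOI>;
# all field names below appear verbatim in the JSON shown in this section.
import json
import urllib.request

DOI = "10.1007/s12369-024-01136-y"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    work = json.load(resp)["message"]  # payload sits under the "message" key

title = work["title"][0]                      # "Dual-Path Transformer-Based GAN for Co-speech Gesture Synthesis"
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work["author"])
journal = work["container-title"][0]          # "International Journal of Social Robotics"
year = work["issued"]["date-parts"][0][0]     # 2024 (online publication date)

print(f"{authors} ({year}). {title}. {journal}. https://doi.org/{work['DOI']}")
print(f"References listed: {work.get('references-count', 0)}")  # 45 in this record

The same parsing applies directly to the JSON text above (json.loads on the raw string, then indexing into "message"), so a live API call is not required if the record is already stored locally.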