{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T04:08:07Z","timestamp":1781582887116,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":95,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076144"],"award-info":[{"award-number":["62076144"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Key Laboratory of next generation interactive media innovative technology","award":["ZDSYS20210623092001004"],"award-info":[{"award-number":["ZDSYS20210623092001004"]}]},{"name":"Shenzhen Science and Technology Program","award":["WDZC20220816140515001, JCYJ20220818101014030"],"award-info":[{"award-number":["WDZC20220816140515001, JCYJ20220818101014030"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612503","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"1033-1044","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":17,"title":["UnifiedGesture: A Unified Gesture Synthesis Model for Multiple Skeletons"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0928-034X","authenticated-orcid":false,"given":"Sicheng","family":"Yang","sequence":"first","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6062-3015","authenticated-orcid":false,"given":"Zilin","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8533-0524","authenticated-orcid":false,"given":"Zhiyong","family":"Wu","sequence":"additional","affiliation":[{"name":"Tsinghua University &amp; The Chinese University of Hong Kong, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1427-3507","authenticated-orcid":false,"given":"Minglei","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Cloud Computing Technologies Co., Ltd, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7911-7564","authenticated-orcid":false,"given":"Zhensong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8113-6459","authenticated-orcid":false,"given":"Qiaochu","family":"Huang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6977-119X","authenticated-orcid":false,"given":"Lei","family":"Hao","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0022-0906","authenticated-orcid":false,"given":"Songcen","family":"Xu","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0143-1485","authenticated-orcid":false,"given":"Xiaofei","family":"Wu","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7043-6657","authenticated-orcid":false,"given":"Changpeng","family":"Yang","sequence":"additional","affiliation":[{"name":"Huawei Cloud Computing Technologies Co., Ltd, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7723-4130","authenticated-orcid":false,"given":"Zonghong","family":"Dai","sequence":"additional","affiliation":[{"name":"Huawei Cloud Computing Technologies Co., Ltd, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392462"},{"key":"e_1_3_2_2_2_1","volume-title":"Ryo Ishii, and Louis-Philippe Morency.","author":"Ahuja Chaitanya","year":"2020","unstructured":"Chaitanya Ahuja, Dong Won Lee, Ryo Ishii, and Louis-Philippe Morency. 2020. No Gestures Left Behind: Learning Relationships between Spoken Language and Freeform Gestures. In Findings of the Association for Computational Linguistics, Vol. EMNLP 2020. Association for Computational Linguistics, 1884--1895."},{"key":"e_1_3_2_2_3_1","volume-title":"Computer Vision - ECCV 2020 - 16th European Conference","author":"Ahuja Chaitanya","unstructured":"Chaitanya Ahuja, DongWon Lee, Yukiko I. Nakano, and Louis-Philippe Morency. 2020. Style Transfer for Co-speech Gesture Animation: A Multi-speaker Conditional-Mixture Approach. In Computer Vision - ECCV 2020 - 16th European Conference, Vol. 12363. Springer, 248--265."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.13946"},{"key":"e_1_3_2_2_5_1","volume-title":"denoise, action! Audio-driven motion synthesis with diffusion models. CoRR abs\/2211.09707","author":"Alexanderson Simon","year":"2022","unstructured":"Simon Alexanderson, Rajmund Nagy, Jonas Beskow, and Gustav Eje Henter. 2022. Listen, denoise, action! Audio-driven motion synthesis with diffusion models. CoRR abs\/2211.09707 (2022). arXiv:2211.09707"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383652.3423874"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_2_2_8_1","unstructured":"Autodesk. 2023. Maya. https:\/\/www.autodesk.com.cn\/products\/maya\/overview."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.1983.6313077"},{"key":"e_1_3_2_2_10_1","unstructured":"Zal\u00e1n Borsos Rapha\u00ebl Marinier Damien Vincent et al. 2022. AudioLM: a Language Modeling Approach to Audio Generation. CoRR abs\/2209.03143 (2022). arXiv:2209.03143"},{"key":"e_1_3_2_2_11_1","volume-title":"Conference on robot learning. PMLR, 330--359","author":"Brown Daniel S","year":"2020","unstructured":"Daniel S Brown, Wonjoon Goo, and Scott Niekum. 2020. Better-thandemonstrator imitation learning via automatically-ranked demonstrations. In Conference on robot learning. PMLR, 330--359."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11791"},{"key":"e_1_3_2_2_14_1","unstructured":"Gabriel Dulac-Arnold Richard Evans Hado van Hasselt et al. 2015. Deep reinforcement learning in large discrete action spaces. arXiv preprint arXiv:1512.07679 (2015)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"crossref","unstructured":"Shuo Feng Haowei Sun Xintao Yan et al. 2023. Dense reinforcement learning for safety validation of autonomous vehicles. Nature 615 7953 (2023) 620--627.","DOI":"10.1038\/s41586-023-05732-2"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267898"},{"key":"e_1_3_2_2_17_1","first-page":"1","article-title":"Multi-objective adversarial gesture generation. In Motion, Interaction and Games, MIG 2019","volume":"3","author":"Ferstl Ylva","year":"2019","unstructured":"Ylva Ferstl, Michael Neff, and Rachel McDonnell. 2019. Multi-objective adversarial gesture generation. In Motion, Interaction and Games, MIG 2019. ACM, 3:1--3:10.","journal-title":"ACM"},{"key":"e_1_3_2_2_18_1","unstructured":"Blender Foundation. 2023. Blender. https:\/\/www.blender.org\/."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14734"},{"key":"e_1_3_2_2_20_1","volume-title":"Learning Individual Styles of Conversational Gesture. In Conference on Computer Vision and Pattern Recognition, CVPR","author":"Ginosar Shiry","year":"2019","unstructured":"Shiry Ginosar, Amir Bar, Gefen Kohavi, et al. 2019. Learning Individual Styles of Conversational Gesture. In Conference on Computer Vision and Pattern Recognition, CVPR 2019. CVF \/ IEEE, 3497--3506."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"e_1_3_2_2_22_1","unstructured":"Tuomas Haarnoja Aurick Zhou Kristian Hartikainen et al. 2018. Soft actor-critic algorithms and applications. arXiv preprint arXiv:1812.05905 (2018)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530750"},{"key":"e_1_3_2_2_24_1","volume-title":"International Conference on Intelligent Virtual Agents. ACM, 101--108","author":"Habibie Ikhsanul","year":"2021","unstructured":"Ikhsanul Habibie, Weipeng Xu, Dushyant Mehta, et al. 2021. Learning Speechdriven 3D Conversational Gestures from Video. In International Conference on Intelligent Virtual Agents. ACM, 101--108."},{"key":"e_1_3_2_2_25_1","volume-title":"Denoising Diffusion Probabilistic Models. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual."},{"key":"e_1_3_2_2_26_1","volume-title":"Classifier-Free Diffusion Guidance. CoRR abs\/2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-Free Diffusion Guidance. CoRR abs\/2207.12598 (2022). arXiv:2207.12598"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530094"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Peter J Huber. 1992. Robust estimation of a location parameter. In Breakthroughs in statistics. 492--518.","DOI":"10.1007\/978-1-4612-4380-9_35"},{"key":"e_1_3_2_2_29_1","volume-title":"FLAME: Free-form Languagebased Motion Synthesis & Editing. CoRR abs\/2209.00349","author":"Kim Jihoon","year":"2022","unstructured":"Jihoon Kim, Jiseob Kim, and Sungjoon Choi. 2022. FLAME: Free-form Languagebased Motion Synthesis & Editing. CoRR abs\/2209.00349 (2022). arXiv:2209.00349"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.13947"},{"key":"e_1_3_2_2_31_1","first-page":"4","article-title":"A variational U-Net for motion retargeting","volume":"31","author":"Kim Seong Uk","year":"2020","unstructured":"Seong Uk Kim, Hanyoung Jang, and Jongmin Kim. 2020. A variational U-Net for motion retargeting. Comput. Animat. Virtual Worlds 31, 4--5 (2020).","journal-title":"Comput. Animat. Virtual Worlds"},{"key":"e_1_3_2_2_32_1","volume-title":"Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR.","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR."},{"key":"e_1_3_2_2_33_1","unstructured":"Michael Kipp. 2003. Gesture generation by imitation: from human behavior to computer character animation. Ph.D. Dissertation. Saarland University Saarbr\u00fccken Germany."},{"key":"e_1_3_2_2_34_1","volume-title":"Reformer: The Efficient Transformer. In 8th International Conference on Learning Representations, ICLR April 26-30","author":"Kitaev Nikita","year":"2020","unstructured":"Nikita Kitaev, Lukasz Kaiser, and Anselm Levskaya. 2020. Reformer: The Efficient Transformer. In 8th International Conference on Learning Representations, ICLR April 26-30."},{"key":"e_1_3_2_2_35_1","volume-title":"Gustav Eje Henter, et al","author":"Kucherenko Taras","year":"2019","unstructured":"Taras Kucherenko, Dai Hasegawa, Gustav Eje Henter, et al. 2019. Analyzing Input and Output Representations for Speech-Driven Gesture Generation. In Intelligent Virtual Agents, IVA 2019. ACM, 97--104."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418815"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397481.3450692"},{"key":"e_1_3_2_2_38_1","volume-title":"Multimodal Analysis of the Predictability of Hand-gesture Properties. In 21st International Conference on Autonomous Agents and Multiagent Systems. IFAAMAS, 770--779","author":"Kucherenko Taras","year":"2022","unstructured":"Taras Kucherenko, Rajmund Nagy, Michael Neff, et al. 2022. Multimodal Analysis of the Predictability of Hand-gesture Properties. In 21st International Conference on Autonomous Agents and Multiagent Systems. IFAAMAS, 770--779."},{"key":"e_1_3_2_2_39_1","unstructured":"Taras Kucherenko PieterWolfert Youngwoo Yoon et al. 2023. Evaluating gesturegeneration in a large-scale open challenge: The GENEA Challenge 2022. CoRR abs\/2303.08737 (2023). arXiv:2303.08737"},{"key":"e_1_3_2_2_40_1","unstructured":"Aviral Kumar Aurick Zhou George Tucker and Sergey Levine. 2020. Conservative Q-Learning for Offline Reinforcement Learning. In Advances in Neural Information Processing Systems 33."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00085"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"crossref","unstructured":"Joonho Lee Jemin Hwangbo Lorenz Wellhausen et al. 2020. Learning quadrupedal locomotion over challenging terrain. Science robotics 5 47 (2020).","DOI":"10.1126\/scirobotics.abc5986"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2022.04.001"},{"key":"e_1_3_2_2_45_1","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, 11040--11049","author":"Li Siyao","year":"2022","unstructured":"Siyao Li, Weijiang Yu, Tianpei Gu, et al. 2022. Bailando: 3D Dance Generation by Actor-Critic GPT with Choreographic Memory. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, 11040--11049."},{"key":"e_1_3_2_2_46_1","volume-title":"CLIFF: Carrying Location Information in Full Frames into Human Pose and Shape Estimation. arXiv:2208.00571 [cs.CV]","author":"Li Zhihao","year":"2022","unstructured":"Zhihao Li, Jianzhuang Liu, Zhensong Zhang, Songcen Xu, and Youliang Yan. 2022. CLIFF: Carrying Location Information in Full Frames into Human Pose and Shape Estimation. arXiv:2208.00571 [cs.CV]"},{"key":"e_1_3_2_2_47_1","volume-title":"SEEG: Semantic Energized Co-speech Gesture Generation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, 10463--10472","author":"Liang Yuanzhi","year":"2022","unstructured":"Yuanzhi Liang, Qianyu Feng, Linchao Zhu, et al. 2022. SEEG: Semantic Energized Co-speech Gesture Generation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, 10463--10472."},{"key":"e_1_3_2_2_48_1","volume-title":"PMnet: Learning of Disentangled Pose and Movement for Unsupervised Motion Retargeting. In 30th British Machine Vision Conference 2019, BMVC 2019","author":"Lim Jongin","year":"2019","unstructured":"Jongin Lim, Hyung Jin Chang, and Jin Young Choi. 2019. PMnet: Learning of Disentangled Pose and Movement for Unsupervised Motion Retargeting. In 30th British Machine Vision Conference 2019, BMVC 2019, Cardiff, UK, September 9-12, 2019. BMVA Press, 136."},{"key":"e_1_3_2_2_49_1","volume-title":"BEAT: A Large-Scale Semantic and Emotional Multi-modal Dataset for Conversational Gestures Synthesis. In ECCV","author":"Liu Haiyang","year":"2022","unstructured":"Haiyang Liu, Zihao Zhu, Naoya Iwamoto, et al. 2022. BEAT: A Large-Scale Semantic and Emotional Multi-modal Dataset for Conversational Gestures Synthesis. In ECCV 2022, Vol. 13667. Springer, 612--630."},{"key":"e_1_3_2_2_50_1","volume-title":"Learning Hierarchical Cross- Modal Association for Co-Speech Gesture Generation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, 10452--10462","author":"Liu Xian","year":"2022","unstructured":"Xian Liu, Qianyi Wu, Hang Zhou, et al. 2022. Learning Hierarchical Cross- Modal Association for Co-Speech Gesture Generation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, 10452--10462."},{"key":"e_1_3_2_2_51_1","volume-title":"DecoupledWeight Decay Regularization. In 7th International Conference on Learning Representations, ICLR, May 6-9.","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. DecoupledWeight Decay Regularization. In 7th International Conference on Learning Representations, ICLR, May 6-9."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558059"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_24"},{"key":"e_1_3_2_2_54_1","volume-title":"Pretrained Diffusion Models for Unified Human Motion Synthesis. CoRR abs\/2212.02837","author":"Ma Jianxin","year":"2022","unstructured":"Jianxin Ma, Shuai Bai, and Chang Zhou. 2022. Pretrained Diffusion Models for Unified Human Motion Synthesis. CoRR abs\/2212.02837 (2022). arXiv:2212.02837"},{"key":"e_1_3_2_2_55_1","first-page":"2","article-title":"Algorithms for inverse reinforcement learning","volume":"1","author":"Ng Andrew Y","year":"2000","unstructured":"Andrew Y Ng, Stuart Russell, et al. 2000. Algorithms for inverse reinforcement learning.. In Icml, Vol. 1. 2.","journal-title":"Icml"},{"key":"e_1_3_2_2_56_1","unstructured":"Simbarashe Nyatsanga Taras Kucherenko Chaitanya Ahuja et al. 2023. A Comprehensive Review of Data-Driven Co-Speech Gesture Generation. CoRR abs\/2301.05339 (2023). arXiv:2301.05339"},{"key":"e_1_3_2_2_57_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. CoRR abs\/2303.08774 (2023). arXiv:2303.08774"},{"key":"e_1_3_2_2_58_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, et al. 2022. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems 35 (2022), 27730--27744.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01245-6"},{"key":"e_1_3_2_2_60_1","unstructured":"Andr\u00e9 Susano Pinto Alexander Kolesnikov Yuge Shi et al. 2023. Tuning computer vision models with task rewards. arXiv preprint arXiv:2302.08242 (2023)."},{"key":"e_1_3_2_2_61_1","unstructured":"Xingqun Qi Chen Liu Muyi Sun et al. 2023. Diverse 3D Hand Gesture Prediction from Body Dynamics by Bilateral Hand Disentanglement. CoRR abs\/2303.01765 (2023). arXiv:2303.01765"},{"key":"e_1_3_2_2_62_1","volume-title":"Speech Drives Templates: Co-Speech Gesture Synthesis with Learned Templates. In IEEE\/CVF International Conference on Computer Vision, ICCV. IEEE, 11057--11066","author":"Qian Shenhan","year":"2021","unstructured":"Shenhan Qian, Zhi Tu, Yihao Zhi, et al. 2021. Speech Drives Templates: Co-Speech Gesture Synthesis with Learned Templates. In IEEE\/CVF International Conference on Computer Vision, ICCV. IEEE, 11057--11066."},{"key":"e_1_3_2_2_63_1","volume-title":"Diffusion Motion: Generate Text-Guided 3D Human Motion by Diffusion Model. CoRR abs\/2210.12315","author":"Ren Zhiyuan","year":"2022","unstructured":"Zhiyuan Ren, Zhihong Pan, Xin Zhou, and Le Kang. 2022. Diffusion Motion: Generate Text-Guided 3D Human Motion by Diffusion Model. CoRR abs\/2210.12315 (2022). arXiv:2210.12315"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00353"},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2019.04.005"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530178"},{"key":"e_1_3_2_2_67_1","unstructured":"Mingyang Sun Mengchen Zhao Yaqing Hou et al. 2023. Co-speech Gesture Synthesis by Reinforcement Learning with Contrastive Pre-trained Rewards. (2023)."},{"key":"e_1_3_2_2_68_1","volume-title":"Generalization in reinforcement learning: Successful examples using sparse coarse coding. Advances in neural information processing systems 8","author":"Sutton Richard S","year":"1995","unstructured":"Richard S Sutton. 1995. Generalization in reinforcement learning: Successful examples using sparse coarse coding. Advances in neural information processing systems 8 (1995)."},{"key":"e_1_3_2_2_69_1","volume-title":"Barto","author":"Sutton Richard S.","year":"1998","unstructured":"Richard S. Sutton and Andrew G. Barto. 1998. Reinforcement learning - an introduction. MIT Press."},{"key":"e_1_3_2_2_70_1","volume-title":"Policy gradient methods for reinforcement learning with function approximation. Advances in neural information processing systems 12","author":"Sutton Richard S","year":"1999","unstructured":"Richard S Sutton, David McAllester, Satinder Singh, and Yishay Mansour. 1999. Policy gradient methods for reinforcement learning with function approximation. Advances in neural information processing systems 12 (1999)."},{"key":"e_1_3_2_2_71_1","volume-title":"Discretizing Continuous Action Space for On-Policy Optimization. In The Thirty-Fourth Conference on Artificial Intelligence, AAAI","author":"Tang Yunhao","year":"2020","unstructured":"Yunhao Tang and Shipra Agrawal. 2020. Discretizing Continuous Action Space for On-Policy Optimization. In The Thirty-Fourth Conference on Artificial Intelligence, AAAI 2020. AAAI Press, 5981--5988."},{"key":"e_1_3_2_2_72_1","unstructured":"Guy Tevet Sigal Raab Brian Gordon et al. 2022. Human Motion Diffusion Model. CoRR abs\/2209.14916 (2022). arXiv:2209.14916"},{"key":"e_1_3_2_2_73_1","volume-title":"EDGE: Editable Dance Generation From Music. CoRR abs\/2211.10658","author":"Tseng Jonathan","year":"2022","unstructured":"Jonathan Tseng, Rodrigo Castellon, and C. Karen Liu. 2022. EDGE: Editable Dance Generation From Music. CoRR abs\/2211.10658 (2022). arXiv:2211.10658"},{"key":"e_1_3_2_2_74_1","unstructured":"A\u00e4ron van den Oord Oriol Vinyals and Koray Kavukcuoglu. 2017. Neural Discrete Representation Learning. In Advances in Neural Information Processing Systems 30. 6306--6315."},{"key":"e_1_3_2_2_75_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar et al. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30. 5998--6008."},{"key":"e_1_3_2_2_76_1","volume-title":"Contact-Aware Retargeting of Skinned Motion. In IEEE\/CVF International Conference on Computer Vision, ICCV. IEEE, 9700--9709","author":"Villegas Ruben","year":"2021","unstructured":"Ruben Villegas, Duygu Ceylan, Aaron Hertzmann, et al. 2021. Contact-Aware Retargeting of Skinned Motion. In IEEE\/CVF International Conference on Computer Vision, ICCV. IEEE, 9700--9709."},{"key":"e_1_3_2_2_77_1","volume-title":"Neural Kinematic Networks for Unsupervised Motion Retargetting. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR. CVF \/ IEEE, 8639--8648","author":"Villegas Ruben","year":"2018","unstructured":"Ruben Villegas, Jimei Yang, Duygu Ceylan, and Honglak Lee. 2018. Neural Kinematic Networks for Unsupervised Motion Retargetting. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR. CVF \/ IEEE, 8639--8648."},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"crossref","unstructured":"Oriol Vinyals Igor Babuschkin Wojciech M Czarnecki et al. 2019. Grandmaster level in StarCraft II using multi-agent reinforcement learning. Nature 575 7782 (2019) 350--354.","DOI":"10.1038\/s41586-019-1724-z"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2019.2936810"},{"key":"e_1_3_2_2_80_1","volume-title":"Integrated Speech and Gesture Synthesis. In ICMI '21: International Conference on Multimodal Interaction, Montr\u00e9al, QC","author":"Wang Siyang","year":"2021","unstructured":"Siyang Wang, Simon Alexanderson, Joakim Gustafson, et al. 2021. Integrated Speech and Gesture Synthesis. In ICMI '21: International Conference on Multimodal Interaction, Montr\u00e9al, QC, Canada, October 18-22, 2021. ACM, 177--185."},{"key":"e_1_3_2_2_81_1","volume-title":"Heterogeneous Graph Attention Network. In The World Wide Web Conference, WWW 2019","author":"Wang Xiao","year":"2019","unstructured":"Xiao Wang, Houye Ji, Chuan Shi, et al. 2019. Heterogeneous Graph Attention Network. In The World Wide Web Conference, WWW 2019. ACM, 2022--2032."},{"key":"e_1_3_2_2_82_1","volume-title":"Machine learning 8","author":"Watkins Christopher JCH","year":"1992","unstructured":"Christopher JCH Watkins and Peter Dayan. 1992. Q-learning. Machine learning 8 (1992), 279--292."},{"key":"e_1_3_2_2_83_1","volume-title":"Simple statistical gradient-following algorithms for connectionist reinforcement learning. Reinforcement learning","author":"Williams Ronald J","year":"1992","unstructured":"Ronald J Williams. 1992. Simple statistical gradient-following algorithms for connectionist reinforcement learning. Reinforcement learning (1992), 5--32."},{"key":"e_1_3_2_2_84_1","unstructured":"Pan Xie Qipeng Zhang Zexian Li et al. 2022. Vector Quantized Diffusion Model with CodeUnet for Text-to-Sign Pose Sequences Generation. CoRR abs\/2208.09141 (2022). arXiv:2208.09141"},{"key":"e_1_3_2_2_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558066"},{"key":"e_1_3_2_2_86_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/650"},{"key":"e_1_3_2_2_87_1","volume-title":"QPGesture: Quantization- Based and Phase-Guided Motion Matching for Natural Speech-Driven Gesture Generation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE.","author":"Yang Sicheng","year":"2023","unstructured":"Sicheng Yang, Zhiyong Wu, Minglei Li, et al. 2023. QPGesture: Quantization- Based and Phase-Guided Motion Matching for Natural Speech-Driven Gesture Generation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE."},{"key":"e_1_3_2_2_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_2_89_1","volume-title":"Robots Learn Social Skills: End-to-End Learning of Co-Speech Gesture Generation for Humanoid Robots. In International Conference on Robotics and Automation, ICRA. IEEE, 4303--4309","author":"Yoon Youngwoo","year":"2019","unstructured":"Youngwoo Yoon, Woo-Ri Ko, Minsu Jang, et al. 2019. Robots Learn Social Skills: End-to-End Learning of Co-Speech Gesture Generation for Humanoid Robots. In International Conference on Robotics and Automation, ICRA. IEEE, 4303--4309."},{"key":"e_1_3_2_2_90_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558058"},{"key":"e_1_3_2_2_91_1","volume-title":"MMM","volume":"13833","author":"Zhang Fan","year":"2023","unstructured":"Fan Zhang, Naye Ji, Fuxing Gao, and Yongping Li. 2023. DiffMotion: Speech- Driven Gesture Synthesis Using Denoising Diffusion Model. In MultiMedia Modeling - 29th, MMM 2023, Vol. 13833. Springer, 231--242."},{"key":"e_1_3_2_2_92_1","volume-title":"GestureMaster: Graph-based Speech-driven Gesture Generation. In International Conference on Multimodal Interaction, ICMI. 764--770","author":"Zhou Chi","year":"2022","unstructured":"Chi Zhou, Tengyue Bian, and Kang Chen. 2022. GestureMaster: Graph-based Speech-driven Gesture Generation. In International Conference on Multimodal Interaction, ICMI. 764--770."},{"key":"e_1_3_2_2_93_1","volume-title":"UDE: A Unified Driving Engine for Human Motion Generation. CoRR abs\/2211.16016","author":"Zhou Zixiang","year":"2022","unstructured":"Zixiang Zhou and Baoyuan Wang. 2022. UDE: A Unified Driving Engine for Human Motion Generation. CoRR abs\/2211.16016 (2022). arXiv:2211.16016"},{"key":"e_1_3_2_2_94_1","unstructured":"Lingting Zhu Xian Liu Xuanyu Liu et al. 2023. Taming Diffusion Models for Audio-Driven Co-Speech Gesture Generation. CoRR abs\/2303.09119 (2023). arXiv:2303.09119"},{"key":"e_1_3_2_2_95_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20274"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612503","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612503","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:02:14Z","timestamp":1755820934000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612503"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":95,"alternative-id":["10.1145\/3581783.3612503","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612503","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}