{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T00:42:48Z","timestamp":1768264968005,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3680528.3687677","type":"proceedings-article","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T08:14:37Z","timestamp":1733213677000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["SIGGesture: Generalized Co-Speech Gesture Synthesis via Semantic Injection with Large-Scale Pre-Training Diffusion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6631-1504","authenticated-orcid":false,"given":"Qingrong","family":"Cheng","sequence":"first","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1365-6546","authenticated-orcid":false,"given":"Xu","family":"Li","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7143-0185","authenticated-orcid":false,"given":"Xinghui","family":"Fu","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"e_1_3_3_3_2_1","unstructured":"Josh Achiam Steven Adler et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"crossref","unstructured":"Simon Alexanderson Rajmund Nagy Jonas Beskow and Gustav\u00a0Eje Henter. 2023. Listen denoise action! audio-driven motion synthesis with diffusion models. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201320.","DOI":"10.1145\/3592458"},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"crossref","unstructured":"Tenglong Ao Qingzhe Gao Yuke Lou Baoquan Chen and Libin Liu. 2022. Rhythmic gesticulator: Rhythm-aware co-speech gesture synthesis with hierarchical neural embeddings. ACM Transactions on Graphics (TOG) 41 6 (2022) 1\u201319.","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_3_3_5_1","unstructured":"Tenglong Ao Zeyi Zhang and Libin Liu. 2023. GestureDiffuCLIP: Gesture diffusion model with CLIP latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.14613 (2023)."},{"key":"e_1_3_3_3_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475223"},{"key":"e_1_3_3_3_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/VR50410.2021.00037"},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_34"},{"key":"e_1_3_3_3_9_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"crossref","unstructured":"Justine Cassell David McNeill and Karl-Erik McCullough. 1999. Speech-gesture mismatches: Evidence for one underlying representation of linguistic and nonlinguistic information. Pragmatics & cognition 7 1 (1999) 1\u201334.","DOI":"10.1075\/pc.7.1.03cas"},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/192161.192272"},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/383259.383315"},{"key":"e_1_3_3_3_14_1","doi-asserted-by":"crossref","unstructured":"Junming Chen Yunfei Liu Jianan Wang Ailing Zeng Yu Li and Qifeng Chen. 2024. DiffSHEG: A Diffusion-Based Approach for Real-Time Speech-driven Holistic 3D Expression and Gesture Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.04747 (2024).","DOI":"10.1109\/CVPR52733.2024.00702"},{"key":"e_1_3_3_3_15_1","doi-asserted-by":"crossref","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao et\u00a0al. 2022. Wavlm: Large-scale self-supervised pre-training for full stack speech processing. IEEE Journal of Selected Topics in Signal Processing 16 6 (2022) 1505\u20131518.","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_3_3_16_1","unstructured":"Prafulla Dhariwal Heewoo Jun Christine Payne Jong\u00a0Wook Kim Alec Radford and Ilya Sutskever. 2020. Jukebox: A generative model for music. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.00341 (2020)."},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267898"},{"key":"e_1_3_3_3_18_1","unstructured":"Nan Gao Zeyu Zhao Zhi Zeng Shuwu Zhang and Dongdong Weng. 2023. GesGPT: Speech Gesture Synthesis With Text Parsing from GPT. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.13013 (2023)."},{"key":"e_1_3_3_3_19_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14734"},{"key":"e_1_3_3_3_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"crossref","unstructured":"Susan Goldin-Meadow and Martha\u00a0Wagner Alibali. 2013. Gesture\u2019s role in speaking learning and creating language. Annual review of psychology 64 (2013) 257\u2013283.","DOI":"10.1146\/annurev-psych-113011-143802"},{"key":"e_1_3_3_3_22_1","doi-asserted-by":"crossref","unstructured":"Ian Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2020. Generative adversarial networks. Commun. ACM 63 11 (2020) 139\u2013144.","DOI":"10.1145\/3422622"},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472306.3478335"},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"crossref","unstructured":"Markus Hafner Maria Katsantoni Tino K\u00f6ster James Marks Joyita Mukherjee Dorothee Staiger Jernej Ule and Mihaela Zavolan. 2021. CLIP and complementary methods. Nature Reviews Methods Primers 1 1 (2021) 20.","DOI":"10.1038\/s43586-021-00018-1"},{"key":"e_1_3_3_3_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267878"},{"key":"e_1_3_3_3_26_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2157689.2157694"},{"key":"e_1_3_3_3_28_1","unstructured":"Longbin Ji Pengfei Wei Yi Ren Jinglin Liu Chen Zhang and Xiang Yin. 2023. C2G2: Controllable Co-speech Gesture Generation with Latent Diffusion Model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.15016 (2023)."},{"key":"e_1_3_3_3_29_1","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1312.6114 (2013)."},{"key":"e_1_3_3_3_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418815"},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"crossref","unstructured":"Alex Lascarides and Matthew Stone. 2009. A formal semantic analysis of gesture. Journal of Semantics 26 4 (2009) 393\u2013449.","DOI":"10.1093\/jos\/ffp004"},{"key":"e_1_3_3_3_32_1","first-page":"763","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Lee Gilwoo","year":"2019","unstructured":"Gilwoo Lee, Zhiwei Deng, Shugao Ma, Takaaki Shiratori, Siddhartha\u00a0S Srinivasa, and Yaser Sheikh. 2019. Talking with hands 16.2 m: A large-scale dataset of synchronized body-finger motion and audio for conversational motion analysis and synthesis. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 763\u2013772."},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00339"},{"key":"e_1_3_3_3_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"e_1_3_3_3_35_1","unstructured":"Haiyang Liu Zihao Zhu Giorgio Becherini Yichen Peng Mingyang Su You Zhou Naoya Iwamoto Bo Zheng and Michael\u00a0J Black. 2023. EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via Masked Audio Gesture Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.00374 (2023)."},{"key":"e_1_3_3_3_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"e_1_3_3_3_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_3_3_38_1","unstructured":"Shuhong Lu Youngwoo Yoon and Andrew Feng. 2023. Co-Speech Gesture Synthesis using Discrete Gesture Token Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.12822 (2023)."},{"key":"e_1_3_3_3_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485895.2485900"},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14776"},{"key":"e_1_3_3_3_41_1","unstructured":"Aaron Van\u00a0Den Oord Oriol Vinyals and Koray Kavukcuoglu. 2017. Neural Discrete Representation Learning. (2017)."},{"key":"e_1_3_3_3_42_1","doi-asserted-by":"crossref","unstructured":"Kunkun Pang Dafei Qin Yingruo Fan Julian Habekost Takaaki Shiratori Junichi Yamagishi and Taku Komura. 2023. Bodyformer: Semantics-guided 3d body gesture synthesis with transformer. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201312.","DOI":"10.1145\/3592456"},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01089"},{"key":"e_1_3_3_3_44_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_3_45_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_3_46_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.02502 (2020)."},{"key":"e_1_3_3_3_47_1","first-page":"2331","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Sun Mingyang","year":"2023","unstructured":"Mingyang Sun, Mengchen Zhao, Yaqing Hou, Minglei Li, Huang Xu, Songcen Xu, and Jianye Hao. 2023. Co-Speech Gesture Synthesis by Reinforcement Learning With Contrastive Pre-Trained Rewards. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2331\u20132340."},{"key":"e_1_3_3_3_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00051"},{"key":"e_1_3_3_3_49_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/650"},{"key":"e_1_3_3_3_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00230"},{"key":"e_1_3_3_3_51_1","doi-asserted-by":"crossref","unstructured":"Sicheng Yang Zunnan Xu Haiwei Xue Yongkang Cheng Shaoli Huang Mingming Gong and Zhiyong Wu. 2024. Freetalker: Controllable Speech and Text-Driven Gesture Generation Based on Diffusion Models for Enhanced Speaker Naturalness. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.03476 (2024).","DOI":"10.1109\/ICASSP48485.2024.10447978"},{"key":"e_1_3_3_3_52_1","unstructured":"Heyuan Yao Zhenhua Song Yuyang Zhou Tenglong Ao Baoquan Chen and Libin Liu. 2023. MoConVQ: Unified Physics-Based Motion Control via Scalable Discrete Representations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.10198 (2023)."},{"key":"e_1_3_3_3_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"e_1_3_3_3_54_1","doi-asserted-by":"crossref","unstructured":"Youngwoo Yoon Bok Cha Joo-Haeng Lee Minsu Jang Jaeyeon Lee Jaehong Kim and Geehyuk Lee. 2020. Speech gesture generation from the trimodal context of text audio and speaker identity. ACM Transactions on Graphics (TOG) 39 6 (2020) 1\u201316.","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"crossref","unstructured":"Mingyuan Zhang Zhongang Cai Liang Pan Fangzhou Hong Xinying Guo Lei Yang and Ziwei Liu. 2024b. Motiondiffuse: Text-driven human motion generation with diffusion model. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024).","DOI":"10.1109\/TPAMI.2024.3355414"},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"e_1_3_3_3_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00982"},{"key":"e_1_3_3_3_58_1","doi-asserted-by":"crossref","unstructured":"Zeyi Zhang Tenglong Ao Yuyao Zhang Qingzhe Gao Chuan Lin Baoquan Chen and Libin Liu. 2024a. Semantic Gesticulator: Semantics-Aware Co-Speech Gesture Synthesis. ACM Transactions on Graphics (TOG) 43 4 (2024) 1\u201317.","DOI":"10.1145\/3658134"},{"key":"e_1_3_3_3_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01902"},{"key":"e_1_3_3_3_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"SA '24: SIGGRAPH Asia 2024 Conference Papers","location":"Tokyo Japan","acronym":"SA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2024 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687677","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680528.3687677","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:20Z","timestamp":1750295900000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687677"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":59,"alternative-id":["10.1145\/3680528.3687677","10.1145\/3680528"],"URL":"https:\/\/doi.org\/10.1145\/3680528.3687677","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}