{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:56:35Z","timestamp":1765310195790,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755140","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"9803-9812","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Contextual Gesture: Co-Speech Gesture Video Generation through Context-aware Gesture Representation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6538-7174","authenticated-orcid":false,"given":"Pinxin","family":"Liu","sequence":"first","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1056-2752","authenticated-orcid":false,"given":"Pengfei","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of California, Irvine, Irvine, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2509-8230","authenticated-orcid":false,"given":"Hyeongwoo","family":"Kim","sequence":"additional","affiliation":[{"name":"Imperial College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8273-6737","authenticated-orcid":false,"given":"Pablo","family":"Garrido","sequence":"additional","affiliation":[{"name":"Flawless AI, Santa Monica, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8626-4119","authenticated-orcid":false,"given":"Ari","family":"Shapiro","sequence":"additional","affiliation":[{"name":"FlawlessAI, Los Angeles, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8775-6879","authenticated-orcid":false,"given":"Kyle","family":"Olszewski","sequence":"additional","affiliation":[{"name":"Flawless AI, Santa Monica, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.170"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_2_1_3_1","volume-title":"WhisperX: Time-Accurate Speech Transcription of Long-Form Audio. INTERSPEECH 2023","author":"Bain Max","year":"2023","unstructured":"Max Bain, Jaesung Huh, Tengda Han, and Andrew Zisserman. 2023. WhisperX: Time-Accurate Speech Transcription of Long-Form Audio. INTERSPEECH 2023 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Human communication research 17, 1","author":"Burgoon Judee K","year":"1990","unstructured":"Judee K Burgoon, Thomas Birk, and Michael Pfau. 1990. Nonverbal Behaviors, Persuasion, and Credibility. Human communication research 17, 1 (1990), 140--169."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Caroline Chan Shiry Ginosar Tinghui Zhou and Alexei A Efros. 2019. Everybody Dance Now. In ICCV.","DOI":"10.1109\/ICCV.2019.00603"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Junming Chen Yunfei Liu Jianan Wang Ailing Zeng Yu Li and Qifeng Chen. 2024. DiffSHEG: A Diffusion-Based Approach for Real-Time Speech-driven Holistic 3D Expression and Gesture Generation. arXiv:2401.04747 [cs.SD] https:\/\/arxiv.org\/abs\/2401.04747","DOI":"10.1109\/CVPR52733.2024.00702"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_8_1","volume-title":"The Interplay Between Gesture and Speech in the Production of Referring Expressions: Investigating the Tradeoff Hypothesis. Topics in cognitive science 4, 2","author":"De Ruiter Jan P","year":"2012","unstructured":"Jan P De Ruiter, Adrian Bangerter, and Paula Dings. 2012. The Interplay Between Gesture and Speech in the Production of Referring Expressions: Investigating the Tradeoff Hypothesis. Topics in cognitive science 4, 2 (2012), 232--248."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3616117"},{"key":"e_1_3_2_1_10_1","volume-title":"EraseAnything: Enabling Concept Erasure in Rectified Flow Transformers. In Forty-second International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=vvBAZJh2nQ","author":"Gao Daiheng","year":"2025","unstructured":"Daiheng Gao, Shilin Lu, Wenbo Zhou, Jiaming Chu, Jie Zhang, Mengxi Jia, Bang Zhang, Zhaoxin Fan, and Weiming Zhang. 2025. EraseAnything: Enabling Concept Erasure in Rectified Flow Transformers. In Forty-second International Conference on Machine Learning. https:\/\/openreview.net\/forum?id=vvBAZJh2nQ"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"S. Ginosar A. Bar G. Kohavi C. Chan A. Owens and J. Malik. 2019. Learning Individual Styles of Conversational Gesture. In CVPR. IEEE.","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00186"},{"key":"e_1_3_2_1_13_1","volume-title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. ICLR","author":"Guo Yuwei","year":"2024","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2024. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. ICLR (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Xu He Qiaochu Huang Zhensong Zhang Zhiwei Lin ZhiyongWu Sicheng Yang Minglei Li Zhiyi Chen Songcen Xu and Xiaofei Wu. 2024. Co-Speech Gesture Video Generation via Motion-Decoupled Diffusion Model. In CVPR. 2263--2273.","DOI":"10.1109\/CVPR52733.2024.00220"},{"key":"e_1_3_2_1_15_1","unstructured":"Xingzhe He Bastian Wandt and Helge Rhodin. 2023. AutoLink: Self-Supervised Learning of Human Skeletons and Object Outlines by Linking Keypoints. arXiv:2205.10636 [cs.CV] https:\/\/arxiv.org\/abs\/2205.10636"},{"key":"e_1_3_2_1_16_1","unstructured":"Edward J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In ICLR. https:\/\/openreview.net\/forum?id=nZeVKeeFYf9"},{"key":"e_1_3_2_1_17_1","volume-title":"Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv preprint arXiv:2311.17117","author":"Hu Li","year":"2023","unstructured":"Li Hu, Xin Gao, Peng Zhang, Ke Sun, Bang Zhang, and Liefeng Bo. 2023. Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv preprint arXiv:2311.17117 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"FreSca: Unveiling the Scaling Space in Diffusion Models. arXiv preprint arXiv:2504.02154","author":"Huang Chao","year":"2025","unstructured":"Chao Huang, Susan Liang, Yunlong Tang, Li Ma, Yapeng Tian, and Chenliang Xu. 2025. FreSca: Unveiling the Scaling Space in Diffusion Models. arXiv preprint arXiv:2504.02154 (2025)."},{"key":"e_1_3_2_1_19_1","volume-title":"Scaling Concept with Text-Guided Diffusion Models. arXiv preprint arXiv:2410.24151","author":"Huang Chao","year":"2024","unstructured":"Chao Huang, Susan Liang, Yunlong Tang, Yapeng Tian, Anurag Kumar, and Chenliang Xu. 2024. Scaling Concept with Text-Guided Diffusion Models. arXiv preprint arXiv:2410.24151 (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Xun Huang and Serge Belongie. 2017. Arbitrary Style Transfer in Real-time with Adaptive Instance Normalization. arXiv:1703.06868 [cs.CV] https:\/\/arxiv.org\/abs\/1703.06868","DOI":"10.1109\/ICCV.2017.167"},{"key":"e_1_3_2_1_21_1","volume-title":"Make-Your-Anchor: A Diffusion-based 2D Avatar Generation Framework. arXiv preprint arXiv:2403.16510","author":"Huang Ziyao","year":"2024","unstructured":"Ziyao Huang, Fan Tang, Yong Zhang, Xiaodong Cun, Juan Cao, Jintao Li, and Tong-Yee Lee. 2024. Make-Your-Anchor: A Diffusion-based 2D Avatar Generation Framework. arXiv preprint arXiv:2403.16510 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Justin Johnson Alexandre Alahi and Li Fei-Fei. 2016. Perceptual Losses for Real-Time Style Transfer and Super-Resolution. arXiv:1603.08155 [cs.CV] https:\/\/arxiv.org\/abs\/1603.08155","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"e_1_3_2_1_23_1","volume-title":"DreamPose: Fashion Image-to-Video Synthesis via Stable Diffusion. arXiv preprint arXiv:2304.06025","author":"Karras Johanna","year":"2023","unstructured":"Johanna Karras, Aleksander Holynski, Ting-Chun Wang, and Ira Kemelmacher-Shlizerman. 2023. DreamPose: Fashion Image-to-Video Synthesis via Stable Diffusion. arXiv preprint arXiv:2304.06025 (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00336"},{"key":"e_1_3_2_1_25_1","volume-title":"Dancing to Music. NeurIPS 32","author":"Lee Hsin-Ying","year":"2019","unstructured":"Hsin-Ying Lee, Xiaodong Yang, Ming-Yu Liu, Ting-Chun Wang, Yu-Ding Lu, Ming-Hsuan Yang, and Jan Kautz. 2019. Dancing to Music. NeurIPS 32 (2019)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548400"},{"key":"e_1_3_2_1_28_1","volume-title":"TANGO: Co-Speech Gesture Video Reenactment with Hierarchical Audio Motion Embedding and Diffusion Interpolation. arXiv preprint arXiv:2410.04221","author":"Liu Haiyang","year":"2024","unstructured":"Haiyang Liu, Xingchao Yang, Tomoya Akiyama, Yuantian Huang, Qiaoge Li, Shigeru Kuriyama, and Takafumi Taketomi. 2024. TANGO: Co-Speech Gesture Video Reenactment with Hierarchical Audio Motion Embedding and Diffusion Interpolation. arXiv preprint arXiv:2410.04221 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via Masked Audio Gesture Modeling. arXiv preprint arXiv:2401.00374","author":"Liu Haiyang","year":"2023","unstructured":"Haiyang Liu, Zihao Zhu, Giorgio Becherini, Yichen Peng, Mingyang Su, You Zhou, Naoya Iwamoto, Bo Zheng, and Michael J Black. 2023. EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via Masked Audio Gesture Modeling. arXiv preprint arXiv:2401.00374 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"BEAT: A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis. arXiv preprint arXiv:2203.05297","author":"Liu Haiyang","year":"2022","unstructured":"Haiyang Liu, Zihao Zhu, Naoya Iwamoto, Yichen Peng, Zhengqing Li, You Zhou, Elif Bozkurt, and Bo Zheng. 2022. BEAT: A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis. arXiv preprint arXiv:2203.05297 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"Intentional Gesture: Deliver Your Intentions with Gestures for Speech. arXiv:2505.15197 [cs.CV] https:\/\/arxiv.org\/abs\/2505.15197","author":"Liu Pinxin","year":"2025","unstructured":"Pinxin Liu, Haiyang Liu, Luchuan Song, and Chenliang Xu. 2025. Intentional Gesture: Deliver Your Intentions with Gestures for Speech. arXiv:2505.15197 [cs.CV] https:\/\/arxiv.org\/abs\/2505.15197"},{"key":"e_1_3_2_1_32_1","volume-title":"GestureLSM: Latent Shortcut based Co-Speech Gesture Generation with Spatial-Temporal Modeling. arXiv preprint arXiv:2501.18898","author":"Liu Pinxin","year":"2025","unstructured":"Pinxin Liu, Luchuan Song, Junhua Huang, and Chenliang Xu. 2025. GestureLSM: Latent Shortcut based Co-Speech Gesture Generation with Spatial-Temporal Modeling. arXiv preprint arXiv:2501.18898 (2025)."},{"key":"e_1_3_2_1_33_1","volume-title":"GaussianStyle: Gaussian Head Avatar via StyleGAN. arXiv preprint arXiv:2402.00827","author":"Liu Pinxin","year":"2024","unstructured":"Pinxin Liu, Luchuan Song, Daoan Zhang, Hang Hua, Yunlong Tang, Huaijin Tu, Jiebo Luo, and Chenliang Xu. 2024. GaussianStyle: Gaussian Head Avatar via StyleGAN. arXiv preprint arXiv:2402.00827 (2024)."},{"key":"e_1_3_2_1_34_1","first-page":"21386","article-title":"Audio-Driven Co-Speech Gesture Video Generation","volume":"35","author":"Liu Xian","year":"2022","unstructured":"Xian Liu, Qianyi Wu, Hang Zhou, Yuanqi Du, Wayne Wu, Dahua Lin, and Ziwei Liu. 2022. Audio-Driven Co-Speech Gesture Video Generation. NeurIPS 35 (2022), 21386--21399.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Xian Liu QianyiWu Hang Zhou Yinghao Xu Rui Qian Xinyi Lin Xiaowei Zhou Wayne Wu Bo Dai and Bolei Zhou. 2022. Learning Hierarchical Cross-Modal Association for Co-Speech Gesture Generation. In CVPR. 10462--10472.","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_2_1_36_1","unstructured":"Yinhan Liu Myle Ott Naman Goyal Jingfei Du Mandar Joshi Danqi Chen Omer Levy Mike Lewis Luke Zettlemoyer and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv:1907.11692 [cs.CL] https:\/\/arxiv.org\/abs\/1907.11692"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-022-12755-w"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00218"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00615"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01841"},{"key":"e_1_3_2_1_41_1","volume-title":"SplatFace: Gaussian splat face reconstruction leveraging an optimizable surface. arXiv preprint arXiv:2403.18784","author":"Luo Jiahao","year":"2024","unstructured":"Jiahao Luo, Jing Liu, and James Davis. 2024. SplatFace: Gaussian splat face reconstruction leveraging an optimizable surface. arXiv preprint arXiv:2403.18784 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Rang Meng Xingyu Zhang Yuming Li and Chenguang Ma. 2024. EchoMimicV2: Towards Striking Simplified and Semi-Body Human Animation. arXiv:2411.10061 [cs.CV]","DOI":"10.1109\/CVPR52734.2025.00516"},{"key":"e_1_3_2_1_44_1","volume-title":"Albert Ali Salah, and Itir Onal Ertugrul","author":"Ning Mang","year":"2024","unstructured":"Mang Ning, Mingxiao Li, Jianlin Su, Haozhe Jia, Lanmiao Liu, Martin Bene\u0161, Wenshuo Chen, Albert Ali Salah, and Itir Onal Ertugrul. 2024. Dctdiff: Intriguing properties of image generative modeling in the dct space. arXiv preprint arXiv:2412.15032 (2024)."},{"key":"e_1_3_2_1_45_1","unstructured":"OpenMMLab. 2020. OpenMMLab Pose Estimation Toolbox and Benchmark. https:\/\/github.com\/open-mmlab\/mmpose."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Taesung Park Ming-Yu Liu Ting-Chun Wang and Jun-Yan Zhu. 2019. Semantic Image Synthesis with Spatially-Adaptive Normalization. In CVPR.","DOI":"10.1109\/CVPR.2019.00244"},{"key":"e_1_3_2_1_47_1","volume-title":"Black","author":"Pavlakos Georgios","year":"2019","unstructured":"Georgios Pavlakos, Vasileios Choutas, Nima Ghorbani, Timo Bolkart, Ahmed A. A. Osman, Dimitrios Tzionas, and Michael J. Black. 2019. Expressive Body Capture: 3D Hands, Face, and Body from a Single Image. In CVPR."},{"volume-title":"Theory and Applications of Digital Speech Processing","author":"Rabiner Lawrence","key":"e_1_3_2_1_48_1","unstructured":"Lawrence Rabiner and Ronald Schafer. 2010. Theory and Applications of Digital Speech Processing. Prentice Hall Press."},{"key":"e_1_3_2_1_49_1","volume-title":"Qin Jin, and Baining Guo.","author":"Ruan Ludan","year":"2023","unstructured":"Ludan Ruan, Yiyang Ma, Huan Yang, Huiguo He, Bei Liu, Jianlong Fu, Nicholas Jing Yuan, Qin Jin, and Baining Guo. 2023. MM-Diffusion: Learning Multi-Modal Diffusion Models for Joint Audio and Video Generation. In CVPR."},{"key":"e_1_3_2_1_50_1","unstructured":"Aliaksandr Siarohin St\u00e9phane Lathuili\u00e8re Sergey Tulyakov Elisa Ricci and Nicu Sebe. 2019. First Order Motion Model for Image Animation. In NeurIPS."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Aliaksandr Siarohin Oliver Woodford Jian Ren Menglei Chai and Sergey Tulyakov. 2021. Motion Representations for Articulated Animation. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687632"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475196"},{"key":"e_1_3_2_1_54_1","volume-title":"European Conference on Computer Vision. Springer, 1--20","author":"Song Luchuan","year":"2024","unstructured":"Luchuan Song, Pinxin Liu, Lele Chen, Guojun Yin, and Chenliang Xu. 2024. Tri 2-plane: Thinking Head Avatar via Feature Pyramid. In European Conference on Computer Vision. Springer, 1--20."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446837"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446837"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01905"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506512"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Yunlong Tang Junjia Guo Hang Hua Susan Liang Mingqian Feng Xinyang Li Rui Mao Chao Huang Jing Bi Zeliang Zhang et al. 2024. VidComposition: Can MLLMs Analyze Compositions in Compiled Videos? arXiv preprint arXiv:2411.10979 (2024).","DOI":"10.1109\/CVPR52734.2025.00794"},{"key":"e_1_3_2_1_60_1","unstructured":"Yunlong Tang Junjia Guo Pinxin Liu ZhiyuanWang Hang Hua Jia-Xing Zhong Yunzhong Xiao Chao Huang Luchuan Song Susan Liang et al. 2025. Generative AI for Cel-Animation: A Survey. arXiv preprint arXiv:2501.06250 (2025)."},{"key":"e_1_3_2_1_61_1","volume-title":"Cardiff: Video salient object ranking chain of thought reasoning for saliency prediction with diffusion. arXiv preprint arXiv:2408.12009","author":"Tang Yunlong","year":"2024","unstructured":"Yunlong Tang, Gen Zhan, Li Yang, Yiting Liao, and Chenliang Xu. 2024. Cardiff: Video salient object ranking chain of thought reasoning for saliency prediction with diffusion. arXiv preprint arXiv:2408.12009 (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"EMO: Emote Portrait Alive-Generating Expressive Portrait Videos with Audio2Video Diffusion Model Under Weak Conditions. arXiv preprint arXiv:2402.17485","author":"Tian Linrui","year":"2024","unstructured":"Linrui Tian, Qi Wang, Bang Zhang, and Liefeng Bo. 2024. EMO: Emote Portrait Alive-Generating Expressive Portrait Videos with Audio2Video Diffusion Model Under Weak Conditions. arXiv preprint arXiv:2402.17485 (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly.","author":"Unterthiner Thomas","year":"2018","unstructured":"Thomas Unterthiner, Sjoerd Van Steenkiste, Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly. 2018. Towards Accurate Generative Models of Video: A New Metric & Challenges. arXiv preprint arXiv:1812.01717 (2018)."},{"key":"e_1_3_2_1_64_1","unstructured":"Aaron van den Oord Yazhe Li and Oriol Vinyals. 2019. Representation Learning with Contrastive Predictive Coding. arXiv:1807.03748 [cs.LG] https:\/\/arxiv.org\/abs\/1807.03748"},{"key":"e_1_3_2_1_65_1","unstructured":"Aaron van den Oord Oriol Vinyals and Koray Kavukcuoglu. 2018. Neural Discrete Representation Learning. arXiv:1711.00937 [cs.LG] https:\/\/arxiv.org\/abs\/1711.00937"},{"key":"e_1_3_2_1_66_1","unstructured":"QingfuWan Wei Zhang and Xiangyang Xue. 2017. DeepSkeleton: Skeleton Map for 3D Human Pose Regression. arXiv:1711.10796 [cs.CV] https:\/\/arxiv.org\/abs\/1711.10796"},{"key":"e_1_3_2_1_67_1","unstructured":"Ting-Chun Wang Ming-Yu Liu Jun-Yan Zhu Andrew Tao Jan Kautz and Bryan Catanzaro. 2018. High-Resolution Image Synthesis and Semantic Manipulation with Conditional GANs. In CVPR."},{"key":"e_1_3_2_1_68_1","volume-title":"Annan Wang, Wenxiu Sun Sun, Qiong Yan, and Weisi Lin.","author":"Wu Haoning","year":"2023","unstructured":"Haoning Wu, Erli Zhang, Liang Liao, Chaofeng Chen, Jingwen Hou Hou, Annan Wang, Wenxiu Sun Sun, Qiong Yan, and Weisi Lin. 2023. Exploring Video Quality Assessment on User Generated Contents from Aesthetic and Technical Perspectives. In ICCV."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_2_1_70_1","unstructured":"Zunnan Xu Yachao Zhang Sicheng Yang Ronghui Li and Xiu Li. 2023. Chain of Generation: Multi-Modal Gesture Synthesis via Cascaded Conditional Control. arXiv:2312.15900 [cs.CV] https:\/\/arxiv.org\/abs\/2312.15900"},{"key":"e_1_3_2_1_71_1","unstructured":"Hongwei Yi Hualin Liang Yifei Liu Qiong Cao Yandong Wen Timo Bolkart Dacheng Tao and Michael J Black. 2023. Generating Holistic 3D Human Motion from Speech. In CVPR."},{"key":"e_1_3_2_1_72_1","volume-title":"Speech Gesture Generation from the Trimodal Context of Text, Audio, and Speaker Identity. ACM TOG 39, 6","author":"Yoon Youngwoo","year":"2020","unstructured":"Youngwoo Yoon, Bok Cha, Joo-Haeng Lee, Minsu Jang, Jaeyeon Lee, Jaehong Kim, and Geehyuk Lee. 2020. Speech Gesture Generation from the Trimodal Context of Text, Audio, and Speaker Identity. ACM TOG 39, 6 (2020)."},{"key":"e_1_3_2_1_73_1","unstructured":"Pengfei Zhang Pinxin Liu Hyeongwoo Kim Pablo Garrido and Bindita Chaudhuri. 2025. KinMo: Kinematic-aware Human Motion Understanding and Generation. arXiv:2411.15472 [cs.CV] https:\/\/arxiv.org\/abs\/2411.15472"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","unstructured":"Jian Zhao and Hui Zhang. 2022. Thin-Plate Spline Motion Model for Image Animation. In CVPR. 3657--3666.","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"e_1_3_2_1_75_1","volume-title":"Zuozhuo Dai, Yinghui Xu, Xun Cao, Yao Yao, Hao Zhu, and Siyu Zhu.","author":"Zhu Shenhao","year":"2024","unstructured":"Shenhao Zhu, Junming Leo Chen, Zuozhuo Dai, Yinghui Xu, Xun Cao, Yao Yao, Hao Zhu, and Siyu Zhu. 2024. Champ: Controllable and Consistent Human Image Animation with 3D Parametric Guidance. arXiv preprint arXiv:2403.14781 (2024)."},{"key":"e_1_3_2_1_76_1","volume-title":"OFTSR: One-Step Flow for Image Super-Resolution with Tunable Fidelity-Realism Trade-offs. arXiv:2412.09465 [cs.CV] https:\/\/arxiv.org\/abs\/2412.09465","author":"Zhu Yuanzhi","year":"2024","unstructured":"Yuanzhi Zhu, Ruiqing Wang, Shilin Lu, Junnan Li, Hanshu Yan, and Kai Zhang. 2024. OFTSR: One-Step Flow for Image Super-Resolution with Tunable Fidelity-Realism Trade-offs. arXiv:2412.09465 [cs.CV] https:\/\/arxiv.org\/abs\/2412.09465"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755140","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:54:53Z","timestamp":1765310093000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755140"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":76,"alternative-id":["10.1145\/3746027.3755140","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755140","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}