{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T08:55:37Z","timestamp":1773392137654,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Natural Science Foundation of China","award":["62171139"],"award-info":[{"award-number":["62171139"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680684","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"3266-3274","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["MDT-A2G: Exploring Masked Diffusion Transformers for Co-Speech Gesture Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2666-753X","authenticated-orcid":false,"given":"Xiaofeng","family":"Mao","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4064-994X","authenticated-orcid":false,"given":"Zhengkai","family":"Jiang","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0788-3259","authenticated-orcid":false,"given":"Qilin","family":"Wang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3732-3035","authenticated-orcid":false,"given":"Chencan","family":"Fu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8891-6766","authenticated-orcid":false,"given":"Jiangning","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1036-5076","authenticated-orcid":false,"given":"Jiafu","family":"Wu","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6592-8411","authenticated-orcid":false,"given":"Yabiao","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University & Tencent Youtu Lab, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4216-8090","authenticated-orcid":false,"given":"Chengjie","family":"Wang","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8308-6360","authenticated-orcid":false,"given":"Wei","family":"Li","sequence":"additional","affiliation":[{"name":"Vivo Communication Technology Co. Ltd, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2650-4146","authenticated-orcid":false,"given":"Mingmin","family":"Chi","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592458"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592097"},{"key":"e_1_3_2_1_3_1","volume-title":"Diffsheg: A diffusion-based approach for real-time speech-driven holistic 3d expression and gesture generation. arXiv preprint arXiv:2401.04747","author":"Chen Junming","year":"2024","unstructured":"Junming Chen, Yunfei Liu, Jianan Wang, Ailing Zeng, Yu Li, and Qifeng Chen. 2024. Diffsheg: A diffusion-based approach for real-time speech-driven holistic 3d expression and gesture generation. 
arXiv preprint arXiv:2401.04747 (2024)."},{"key":"e_1_3_2_1_4_1","unstructured":"Junsong Chen Jincheng Yu Chongjian Ge Lewei Yao Enze Xie Yue Wu Zhongdao Wang James Kwok Ping Luo Huchuan Lu et al. 2023. PixArt-\u03b1: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis. arXiv preprint arXiv:2310.00426 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao Jian Wu Long Zhou Shuo Ren Yanmin Qian Yao Qian Jian Wu Michael Zeng and Furu Wei. 2021. WavLM: Large-Scale Self-Supervised Pre-training for Full Stack Speech Processing. (2021). arXiv:2110.13900 [cs.CL]","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_6_1","volume-title":"Emotional Speech-driven 3D Body Animation via Disentangled Latent Diffusion. arXiv preprint arXiv:2312.04466","author":"Chhatre Kiran","year":"2023","unstructured":"Kiran Chhatre, Nikos Athanasiou, Giorgio Becherini, Christopher Peters, Michael J Black, and Timo Bolkart. 2023. Emotional Speech-driven 3D Body Animation via Disentangled Latent Diffusion. arXiv preprint arXiv:2312.04466 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Machine Learning. PMLR, 2166--2177","author":"Crabb\u00e9 Jonathan","year":"2021","unstructured":"Jonathan Crabb\u00e9 and Mihaela Van Der Schaar. 2021. Explaining time series predictions with dynamic masks. In International Conference on Machine Learning. PMLR, 2166--2177."},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. 
arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02117"},{"key":"e_1_3_2_1_10_1","volume-title":"MDTv2: Masked Diffusion Transformer is a Strong Image Synthesizer. arXiv preprint arXiv:2303.14389","author":"Gao Shanghua","year":"2023","unstructured":"Shanghua Gao, Pan Zhou, Ming-Ming Cheng, and Shuicheng Yan. 2023. MDTv2: Masked Diffusion Transformer is a Strong Image Synthesizer. arXiv preprint arXiv:2303.14389 (2023)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_13_1","first-page":"28708","article-title":"Masked autoencoders that listen","volume":"35","author":"Huang Po-Yao","year":"2022","unstructured":"Po-Yao Huang, Hu Xu, Juncheng Li, Alexei Baevski, Michael Auli, Wojciech Galuba, Florian Metze, and Christoph Feichtenhofer. 2022. Masked autoencoders that listen. Advances in Neural Information Processing Systems 35 (2022), 28708-- 28720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i7.25996"},{"key":"e_1_3_2_1_15_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_16_1","volume-title":"DiffWave: A Versatile Diffusion Model for Audio Synthesis. In International Conference on Learning Representations.","author":"Kong Zhifeng","year":"2020","unstructured":"Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. 2020. DiffWave: A Versatile Diffusion Model for Audio Synthesis. 
In International Conference on Learning Representations."},{"key":"e_1_3_2_1_17_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)."},{"key":"e_1_3_2_1_18_1","volume-title":"Dancing to music. Advances in neural information processing systems 32","author":"Lee Hsin-Ying","year":"2019","unstructured":"Hsin-Ying Lee, Xiaodong Yang, Ming-Yu Liu, Ting-Chun Wang, Yu-Ding Lu, Ming-Hsuan Yang, and Jan Kautz. 2019. Dancing to music. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548400"},{"key":"e_1_3_2_1_22_1","volume-title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via Masked Audio Gesture Modeling. arXiv preprint arXiv:2401.00374","author":"Liu Haiyang","year":"2023","unstructured":"Haiyang Liu, Zihao Zhu, Giorgio Becherini, Yichen Peng, Mingyang Su, You Zhou, Naoya Iwamoto, Bo Zheng, and Michael J Black. 2023. EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via Masked Audio Gesture Modeling. arXiv preprint arXiv:2401.00374 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_2_1_25_1","volume-title":"Latte: Latent diffusion transformer for video generation. 
arXiv preprint arXiv:2401.03048","author":"Ma Xin","year":"2024","unstructured":"Xin Ma, Yaohui Wang, Gengyun Jia, Xinyuan Chen, Ziwei Liu, Yuan-Fang Li, Cunjian Chen, and Yu Qiao. 2024. Latte: Latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the International Conference on Language Resources and Evaluation (LREC","author":"Mikolov Tomas","year":"2018","unstructured":"Tomas Mikolov, Edouard Grave, Piotr Bojanowski, Christian Puhrsch, and Armand Joulin. 2018. Advances in Pre-Training Distributed Word Representations. In Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)."},{"key":"e_1_3_2_1_27_1","volume-title":"Dit-3d: Exploring plain diffusion transformers for 3d shape generation. Advances in Neural Information Processing Systems 36","author":"Mo Shentong","year":"2024","unstructured":"Shentong Mo, Enze Xie, Ruihang Chu, Lanqing Hong, Matthias Niessner, and Zhenguo Li. 2024. Dit-3d: Exploring plain diffusion transformers for 3d shape generation. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Albert Ali Salah, and Itir Onal Ertugrul","author":"Ning Mang","year":"2023","unstructured":"Mang Ning, Mingxiao Li, Jianlin Su, Albert Ali Salah, and Itir Onal Ertugrul. 2023. Elucidating the exposure bias in diffusion models. arXiv preprint arXiv:2308.15321 (2023)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_30_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_31_1","volume-title":"Human Motion Diffusion Model. 
In The Eleventh International Conference on Learning Representations.","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yoni Shafir, Daniel Cohen-or, and Amit Haim Bermano. 2022. Human Motion Diffusion Model. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_32_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems 35","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems 35 (2022), 10078--10093."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Sen Wang Jiangning Zhang Weijian Cao Xiaobin Hu Moran Li Xiaozhong Ji Xin Tan Mengtian Li Zhifeng Xie Chengjie Wang et al. 2024. MMoFusion: Multi-modal Co-Speech Motion Generation with Diffusion Model. arXiv preprint arXiv:2403.02905 (2024).","DOI":"10.2139\/ssrn.4965116"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612503"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/650"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Sicheng Yang Zunnan Xu Haiwei Xue Yongkang Cheng Shaoli Huang Mingming Gong and Zhiyong Wu. 2024. Freetalker: Controllable Speech and Text- Driven Gesture Generation Based on Diffusion Models for Enhanced Speaker Naturalness. 
In ICASSP 2024 - 2024 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP).","DOI":"10.1109\/ICASSP48485.2024.10447978"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3616114"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_40_1","volume-title":"Motiondiffuse: Text-driven human motion generation with diffusion model","author":"Zhang Mingyuan","year":"2024","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2024. Motiondiffuse: Text-driven human motion generation with diffusion model. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on 
Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680684","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680684","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:57Z","timestamp":1750295877000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680684"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":41,"alternative-id":["10.1145\/3664647.3680684","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680684","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}