{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,17]],"date-time":"2026-07-17T06:10:47Z","timestamp":1784268647408,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","funder":[{"name":"Young Scientists Fund of the National Natural Science Foundation of China , the National Key Research","award":["No. 624B2110"],"award-info":[{"award-number":["No. 624B2110"]}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No. 2024YFC3015600"],"award-info":[{"award-number":["No. 2024YFC3015600"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Fundamental Research Funds for Central Universities","award":["No.2042023KF0180 & No.2042025KF0053"],"award-info":[{"award-number":["No.2042023KF0180 & No.2042025KF0053"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754847","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"10827-10836","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["EchoMask: Speech-Queried Attention-based Mask Modeling for Holistic Co-Speech Motion Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-5642-9474","authenticated-orcid":false,"given":"Xiangyue","family":"Zhang","sequence":"first","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2716-1886","authenticated-orcid":false,"given":"Jianfang","family":"Li","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9551-2708","authenticated-orcid":false,"given":"Jiaxu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3840-5416","authenticated-orcid":false,"given":"Jianqiang","family":"Ren","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3142-0189","authenticated-orcid":false,"given":"Liefeng","family":"Bo","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group, Bellevue, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5003-2260","authenticated-orcid":false,"given":"Zhigang","family":"Tu","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01991"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592458"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592097"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14507-14517","author":"Chaminda Bandara Wele Gedara","year":"2023","unstructured":"Wele Gedara Chaminda Bandara, Naman Patel, Ali Gholami, Mehdi Nikkhah, Motilal Agrawal, and Vishal M Patel. 2023. Adamae: Adaptive masking for efficient spatiotemporal learning with masked autoencoders. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14507-14517."},{"key":"e_1_3_2_1_6_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Zal\u00e1n Borsos Rapha\u00ebl Marinier Damien Vincent Eugene Kharitonov Olivier Pietquin Matt Sharifi Dominik Roblek Olivier Teboul David Grangier Marco Tagliasacchi et al. 2023. Audiolm: a language modeling approach to audio generation. IEEE\/ACM transactions on audio speech and language processing Vol. 31 (2023) 2523-2533.","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/383259.383315"},{"key":"e_1_3_2_1_9_1","volume-title":"Muse: Text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704","author":"Chang Huiwen","year":"2023","unstructured":"Huiwen Chang, Han Zhang, Jarred Barber, AJ Maschinot, Jose Lezama, Lu Jiang, Ming-Hsuan Yang, Kevin Murphy, William T Freeman, Michael Rubinstein, et al., 2023. Muse: Text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680847"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00702"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00190"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618183"},{"key":"e_1_3_2_1_15_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_16_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00186"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472306.3478335"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00220"},{"key":"e_1_3_2_1_23_1","volume-title":"Milan: Masked image pretraining on language assisted representation. arXiv preprint arXiv:2208.06049","author":"Hou Zejiang","year":"2022","unstructured":"Zejiang Hou, Fei Sun, Yen-Kuang Chen, Yuan Xie, and Sun-Yuan Kung. 2022. Milan: Masked image pretraining on language assisted representation. arXiv preprint arXiv:2208.06049 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451-3460."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2157689.2157694"},{"key":"e_1_3_2_1_26_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Jeong Minjae","unstructured":"Minjae Jeong, Yechan Hwang, Jaejin Lee, Sungyoon Jung, and Won Hwa Kim. [n.d.]. HGM^3: Hierarchical Generative Masked Motion Modeling with Hard Token Mining. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_18"},{"key":"e_1_3_2_1_28_1","unstructured":"Michael Kipp. Universal-Publishers 2005. Gesture generation by imitation: From human behavior to computer character animation."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/11821830_17"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"e_1_3_2_1_32_1","unstructured":"Zhe Li Weihao Yuan Yisheng He Lingteng Qiu Shenhao Zhu Xiaodong Gu Weichao Shen Yuan Dong Zilong Dong and Laurence T Yang. 2024. LaMP: Language-Motion Pretraining for Motion Generation Retrieval and Captioning. arXiv preprint arXiv:2410.07093 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548400"},{"key":"e_1_3_2_1_34_1","volume-title":"Tango: Co-speech gesture video reenactment with hierarchical audio motion embedding and diffusion interpolation. arXiv preprint arXiv:2410.04221","author":"Liu Haiyang","year":"2024","unstructured":"Haiyang Liu, Xingchao Yang, Tomoya Akiyama, Yuantian Huang, Qiaoge Li, Shigeru Kuriyama, and Takafumi Taketomi. 2024b. Tango: Co-speech gesture video reenactment with hierarchical audio motion embedding and diffusion interpolation. arXiv preprint arXiv:2410.04221 (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"e_1_3_2_1_37_1","first-page":"21386","article-title":"Audio-driven co-speech gesture video generation","volume":"35","author":"Liu Xian","year":"2022","unstructured":"Xian Liu, Qianyi Wu, Hang Zhou, Yuanqi Du, Wayne Wu, Dahua Lin, and Ziwei Liu. 2022b. Audio-driven co-speech gesture video generation. Advances in Neural Information Processing Systems, Vol. 35 (2022), 21386-21399.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00155"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25269"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01060"},{"key":"e_1_3_2_1_42_1","volume-title":"European Conference on Computer Vision. Springer, 172-190","author":"Pinyoanuntapong Ekkasit","year":"2024","unstructured":"Ekkasit Pinyoanuntapong, Muhammad Usama Saleem, Pu Wang, Minwoo Lee, Srijan Das, and Chen Chen. 2024a. BAMM: bidirectional autoregressive motion model. In European Conference on Computer Vision. Springer, 172-190."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00153"},{"key":"e_1_3_2_1_44_1","volume-title":"Haoyang Wang, Jiechao Gao, et al.","author":"Shu Kai","year":"2024","unstructured":"Kai Shu, Haoyi Zhang, Wai Seng Cheang, Haoyang Wang, Jiechao Gao, et al., 2024. Eggesture: Entropy-guided vector quantized variational autoencoder for co-speech gesture generation. In ACM Multimedia 2024."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02486"},{"key":"e_1_3_2_1_46_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems Vol. 30 (2017)."},{"key":"e_1_3_2_1_47_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01000"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 9332-9341","author":"Wang Yaxing","unstructured":"Yaxing Wang, Abel Gonzalez-Garcia, David Berga, Luis Herranz, Fahad Shahbaz Khan, and Joost van de Weijer. 2020. Minegan: effective knowledge transfer from gans to target domains with few images. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 9332-9341."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/GlobalSIP45357.2019.8969272"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681321"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_2_1_54_1","volume-title":"Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919","author":"Yang Sicheng","year":"2023","unstructured":"Sicheng Yang, Zhiyong Wu, Minglei Li, Zhensong Zhang, Lei Hao, Weihong Bao, Ming Cheng, and Long Xiao. 2023. Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919 (2023)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20065-6_41"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"e_1_3_2_1_59_1","volume-title":"SemTalk: Holistic Co-speech Motion Generation with Frame-level Semantic Emphasis. arXiv preprint arXiv:2412.16563","author":"Zhang Xiangyue","year":"2024","unstructured":"Xiangyue Zhang, Jianfang Li, Jiaxu Zhang, Ziqiang Dang, Jianqiang Ren, Liefeng Bo, and Zhigang Tu. 2024. SemTalk: Holistic Co-speech Motion Generation with Frame-level Semantic Emphasis. arXiv preprint arXiv:2412.16563 (2024)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754847","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:42:29Z","timestamp":1765309349000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754847"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":60,"alternative-id":["10.1145\/3746027.3754847","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754847","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}